Repository: woodfrog/maptracker Branch: main Commit: ee8321fa5dac Files: 161 Total size: 1.1 MB Directory structure: gitextract_kr8oe7q1/ ├── .gitignore ├── LICENSE ├── LICENSE_GPL ├── README.md ├── docs/ │ ├── data_preparation.md │ ├── getting_started.md │ └── installation.md ├── plugin/ │ ├── __init__.py │ ├── configs/ │ │ ├── _base_/ │ │ │ ├── datasets/ │ │ │ │ ├── coco_instance.py │ │ │ │ ├── kitti-3d-3class.py │ │ │ │ ├── kitti-3d-car.py │ │ │ │ ├── lyft-3d.py │ │ │ │ ├── nuim_instance.py │ │ │ │ ├── nus-3d.py │ │ │ │ ├── nus-mono3d.py │ │ │ │ ├── range100_lyft-3d.py │ │ │ │ ├── s3dis_seg-3d-13class.py │ │ │ │ ├── scannet-3d-18class.py │ │ │ │ ├── scannet_seg-3d-20class.py │ │ │ │ ├── sunrgbd-3d-10class.py │ │ │ │ ├── waymoD5-3d-3class.py │ │ │ │ └── waymoD5-3d-car.py │ │ │ ├── default_runtime.py │ │ │ ├── models/ │ │ │ │ ├── 3dssd.py │ │ │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ │ │ ├── centerpoint_01voxel_second_secfpn_nus.py │ │ │ │ ├── centerpoint_02pillar_second_secfpn_nus.py │ │ │ │ ├── fcos3d.py │ │ │ │ ├── groupfree3d.py │ │ │ │ ├── h3dnet.py │ │ │ │ ├── hv_pointpillars_fpn_lyft.py │ │ │ │ ├── hv_pointpillars_fpn_nus.py │ │ │ │ ├── hv_pointpillars_fpn_range100_lyft.py │ │ │ │ ├── hv_pointpillars_secfpn_kitti.py │ │ │ │ ├── hv_pointpillars_secfpn_waymo.py │ │ │ │ ├── hv_second_secfpn_kitti.py │ │ │ │ ├── hv_second_secfpn_waymo.py │ │ │ │ ├── imvotenet_image.py │ │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ │ ├── paconv_cuda_ssg.py │ │ │ │ ├── paconv_ssg.py │ │ │ │ ├── parta2.py │ │ │ │ ├── pointnet2_msg.py │ │ │ │ ├── pointnet2_ssg.py │ │ │ │ └── votenet.py │ │ │ └── schedules/ │ │ │ ├── cosine.py │ │ │ ├── cyclic_20e.py │ │ │ ├── cyclic_40e.py │ │ │ ├── mmdet_schedule_1x.py │ │ │ ├── schedule_2x.py │ │ │ ├── schedule_3x.py │ │ │ ├── seg_cosine_150e.py │ │ │ ├── seg_cosine_200e.py │ │ │ └── seg_cosine_50e.py │ │ └── maptracker/ │ │ ├── av2_newsplit/ │ │ │ ├── maptracker_av2_100x50_newsplit_5frame_span10_stage1_bev_pretrain.py │ │ │ ├── maptracker_av2_100x50_newsplit_5frame_span10_stage2_warmup.py │ │ │ ├── maptracker_av2_100x50_newsplit_5frame_span10_stage3_joint_finetune.py │ │ │ ├── maptracker_av2_newsplit_5frame_span10_stage1_bev_pretrain.py │ │ │ ├── maptracker_av2_newsplit_5frame_span10_stage2_warmup.py │ │ │ └── maptracker_av2_newsplit_5frame_span10_stage3_joint_finetune.py │ │ ├── av2_oldsplit/ │ │ │ ├── maptracker_av2_oldsplit_5frame_span10_stage1_bev_pretrain.py │ │ │ ├── maptracker_av2_oldsplit_5frame_span10_stage2_warmup.py │ │ │ └── maptracker_av2_oldsplit_5frame_span10_stage3_joint_finetune.py │ │ ├── nuscenes_newsplit/ │ │ │ ├── maptracker_nusc_newsplit_5frame_span10_stage1_bev_pretrain.py │ │ │ ├── maptracker_nusc_newsplit_5frame_span10_stage2_warmup.py │ │ │ └── maptracker_nusc_newsplit_5frame_span10_stage3_joint_finetune.py │ │ └── nuscenes_oldsplit/ │ │ ├── maptracker_nusc_oldsplit_5frame_span10_stage1_bev_pretrain.py │ │ ├── maptracker_nusc_oldsplit_5frame_span10_stage2_warmup.py │ │ └── maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py │ ├── core/ │ │ ├── apis/ │ │ │ ├── __init__.py │ │ │ ├── mmdet_train.py │ │ │ ├── test.py │ │ │ └── train.py │ │ └── evaluation/ │ │ ├── __init__.py │ │ └── eval_hooks.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── argo_dataset.py │ │ ├── base_dataset.py │ │ ├── builder.py │ │ ├── evaluation/ │ │ │ ├── AP.py │ │ │ ├── __init__.py │ │ │ ├── distance.py │ │ │ ├── raster_eval.py │ │ │ └── vector_eval.py │ │ ├── map_utils/ │ │ │ ├── av2map_extractor.py │ │ │ ├── nuscmap_extractor.py │ │ │ └── utils.py │ │ ├── nusc_dataset.py 
│ │ ├── pipelines/ │ │ │ ├── __init__.py │ │ │ ├── formating.py │ │ │ ├── loading.py │ │ │ ├── rasterize.py │ │ │ ├── transform.py │ │ │ └── vectorize.py │ │ ├── samplers/ │ │ │ ├── __init__.py │ │ │ ├── distributed_sampler.py │ │ │ ├── group_sampler.py │ │ │ └── sampler.py │ │ └── visualize/ │ │ └── renderer.py │ └── models/ │ ├── __init__.py │ ├── assigner/ │ │ ├── __init__.py │ │ ├── assigner.py │ │ └── match_cost.py │ ├── backbones/ │ │ ├── __init__.py │ │ ├── bevformer/ │ │ │ ├── __init__.py │ │ │ ├── custom_base_transformer_layer.py │ │ │ ├── encoder.py │ │ │ ├── grid_mask.py │ │ │ ├── multi_scale_deformable_attn_function.py │ │ │ ├── spatial_cross_attention.py │ │ │ ├── temporal_net.py │ │ │ ├── temporal_self_attention.py │ │ │ └── transformer.py │ │ └── bevformer_backbone.py │ ├── heads/ │ │ ├── MapDetectorHead.py │ │ ├── MapSegHead.py │ │ ├── __init__.py │ │ └── base_map_head.py │ ├── losses/ │ │ ├── __init__.py │ │ ├── detr_loss.py │ │ └── seg_loss.py │ ├── mapers/ │ │ ├── MapTracker.py │ │ ├── __init__.py │ │ ├── base_mapper.py │ │ └── vector_memory.py │ ├── necks/ │ │ ├── __init__.py │ │ └── gru.py │ ├── transformer_utils/ │ │ ├── CustomMSDeformableAttention.py │ │ ├── MapTransformer.py │ │ ├── __init__.py │ │ ├── base_transformer.py │ │ ├── deformable_transformer.py │ │ └── fp16_dattn.py │ └── utils/ │ ├── __init__.py │ ├── query_update.py │ └── renderer_track.py ├── requirements.txt └── tools/ ├── benchmark.py ├── data_converter/ │ ├── __init__.py │ ├── argoverse_converter.py │ ├── av2_train_split.txt │ ├── av2_val_split.txt │ ├── nusc_split.py │ └── nuscenes_converter.py ├── dist_test.sh ├── dist_train.sh ├── mmdet_test.py ├── mmdet_train.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── tracking/ │ ├── calculate_cmap.py │ ├── cmap_utils/ │ │ ├── __init__.py │ │ ├── data_utils.py │ │ ├── match_utils.py │ │ └── utils.py │ ├── prepare_gt_tracks.py │ └── prepare_pred_tracks.py ├── train.py └── visualization/ ├── vis_global.py └── vis_per_frame.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class *.ipynb # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # cython generated cpp data .vscode .idea # custom *.pkl *.gif *.pkl.json *.log.json work_dirs/ work_dirs_bak/ debug_img/ model_file/ exps/ *~ mmdet3d/.mim # Pytorch *.pth # demo demo/ *.jpg *.png *.obj *.ply *.zip *.tar *.tar.gz *.json # datasets /datasets /data_ann /datasets_local # softlinks av2 nuScenes # viz viz viz_bak *pkl* demo mmdetection3d work_dirs vis_global vis_local ================================================ FILE: LICENSE ================================================ The code, data, and model weights in this repository are not allowed for commercial usage. For research purposes, the terms follow the GPLv3 as in the separate file "LICENSE_GPL". -- Authors of the paper "MapTracker: Tracking with Strided Memory Fusion for Consistent Vector HD Mapping". ================================================ FILE: LICENSE_GPL ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. 
Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. 
Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. 
You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. 
"Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. 
Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. 
It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . ================================================ FILE: README.md ================================================

MapTracker: Tracking with Strided Memory Fusion for
Consistent Vector HD Mapping

ECCV 2024 (Oral)

[Jiacheng Chen*1](https://jcchen.me), [Yuefan Wu*1](https://ivenwu.com/), [Jiaqi Tan*1](https://www.linkedin.com/in/jiaqi-christina-tan-800697158/), [Hang Ma1](https://www.cs.sfu.ca/~hangma/), [Yasutaka Furukawa1,2](https://www2.cs.sfu.ca/~furukawa/) (1 Simon Fraser University, 2 Wayve) ([arXiv](https://arxiv.org/abs/2403.15951), [Project page](https://map-tracker.github.io/))
https://github.com/woodfrog/maptracker/assets/13405255/1c0e072a-cb77-4000-b81b-5b9fd40f8f39 This repository provides the official implementation of the paper [MapTracker: Tracking with Strided Memory Fusion for Consistent Vector HD Mapping](https://arxiv.org/abs/2403.15951). MapTracker reconstructs temporally consistent vector HD maps, and the local maps can be progressively merged into a global reconstruction. This repository is built upon [StreamMapNet](https://github.com/yuantianyuan01/StreamMapNet). ## Table of Contents - [Introduction](#introduction) - [Model Architecture](#model-architecture) - [Installation](#installation) - [Data preparation](#data-preparation) - [Getting Started](#getting-started) - [Acknowledgements](#acknowledgements) - [Citation](#citation) - [License](#license) ## Introduction This paper presents a vector HD-mapping algorithm that formulates mapping as a tracking task and uses a history of memory latents to ensure consistent reconstructions over time. Our method, MapTracker, accumulates a sensor stream into memory buffers of two latent representations: 1) raster latents in the bird's-eye-view (BEV) space and 2) vector latents over the road elements (i.e., pedestrian crossings, lane dividers, and road boundaries). The approach borrows the query propagation paradigm from the tracking literature, which explicitly associates tracked road elements from the previous frame to the current one, while fusing a subset of memory latents selected with distance strides to further enhance temporal consistency. A vector latent is decoded to reconstruct the geometry of a road element. The paper further makes benchmark contributions by 1) improving the processing code for existing datasets to produce consistent ground truth with temporal alignments and 2) augmenting existing mAP metrics with consistency checks. MapTracker significantly outperforms existing methods on both the nuScenes and Argoverse2 datasets, by over 8% and 19% on the conventional and the new consistency-aware metrics, respectively. ## Model Architecture ![visualization](docs/fig/arch.png) (Top) The architecture of MapTracker, consisting of the BEV and VEC modules and their memory buffers. (Bottom) Close-up views of the BEV and vector fusion layers. The **BEV Module** takes ConvNet features of the onboard perspective images, the BEV memory buffer $\{M_{\text{BEV}}(t-1), M_{\text{BEV}}(t-2), \dots\}$, and vehicle motions $\{P^t_{t-1}, P^t_{t-2}, \dots\}$ as input. It propagates the previous BEV memory $M_{\text{BEV}}(t-1)$ based on the vehicle motion to initialize $M_{\text{BEV}}(t)$. In the BEV Memory Fusion layer, $M_{\text{BEV}}(t)$ is integrated with selected history BEV memories $\{M_{\text{BEV}}^{*}(t'), t'\in \pi(t)\}$, which is used for semantic segmentation and passed to the VEC Module. The **VEC Module** propagates the previous vector latent memory $M_{\text{VEC}}(t-1)$ with a PropMLP to initialize the vector queries $M_{\text{VEC}}(t)$. In the Vector Memory Fusion layer, each propagated $M_{\text{VEC}}(t)$ is fused with its selected history vector memories $\{M_{\text{VEC}}^{*}(t'), t' \in \pi(t)\}$. The final vector latents are decoded to reconstruct the road elements. ## Installation Please refer to the [installation guide](docs/installation.md) to set up the environment. ## Data preparation For instructions on downloading and preparing the nuScenes and Argoverse2 datasets, as well as downloading our checkpoints, please see the [data preparation guide](docs/data_preparation.md).
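To make the distance-strided memory selection $\pi(t)$ described in the Model Architecture section above more concrete, here is a minimal, self-contained sketch of one way history frames could be picked by distance strides. The function name `select_strided_memories`, the stride values, and the use of raw ego positions are illustrative assumptions for this README rather than the repository's actual API; the real selection and fusion logic lives in the model code under `plugin/models/` and may differ in detail.

```python
import numpy as np

def select_strided_memories(ego_positions, strides=(1.0, 5.0, 10.0, 20.0)):
    """Toy example of picking history frames by distance strides.

    ego_positions: (T, 2) array of ego x/y positions; index T-1 is the current frame.
    strides: target travelled distances in meters (values here are illustrative).
    Returns a sorted list of past frame indices, i.e. one possible pi(t).
    """
    ego_positions = np.asarray(ego_positions, dtype=float)
    cur = ego_positions[-1]
    # Distance of every *past* frame's ego position from the current one.
    dists = np.linalg.norm(ego_positions[:-1] - cur, axis=1)
    if dists.size == 0:
        return []
    # For each target stride, keep the past frame whose distance matches it best.
    selected = {int(np.argmin(np.abs(dists - s))) for s in strides}
    return sorted(selected)

# Example: a vehicle moving forward roughly 2 m per frame.
positions = [(2.0 * i, 0.0) for i in range(12)]
print(select_strided_memories(positions))  # -> [1, 6, 8, 10]
```

Note that this sketch only covers the choice of which past frames to fuse; the warping of BEV memories by relative poses and the attention-based fusion itself are separate steps handled inside the model.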
## Getting Started For instructions on how to run training, inference, evaluation, and visualization, please follow the [getting started guide](docs/getting_started.md). ## Acknowledgements We're grateful to the open-source projects below; their great work made our project possible: * BEV perception: [BEVFormer](https://github.com/fundamentalvision/BEVFormer) ![GitHub stars](https://img.shields.io/github/stars/fundamentalvision/BEVFormer.svg?style=flat&label=Star) * Vector HD mapping: [StreamMapNet](https://github.com/yuantianyuan01/StreamMapNet) ![GitHub stars](https://img.shields.io/github/stars/yuantianyuan01/StreamMapNet.svg?style=flat&label=Star), [MapTR](https://github.com/hustvl/MapTR) ![GitHub stars](https://img.shields.io/github/stars/hustvl/MapTR.svg?style=flat&label=Star) ## Citation If you find MapTracker useful in your research or applications, please consider citing: ``` @inproceedings{chen2024maptracker, author = {Chen, Jiacheng and Wu, Yuefan and Tan, Jiaqi and Ma, Hang and Furukawa, Yasutaka}, title = {MapTracker: Tracking with Strided Memory Fusion for Consistent Vector HD Mapping}, journal = {arXiv preprint arXiv:2403.15951}, year = {2024} } ``` ## License This project is licensed under the GPL; see the [license file](LICENSE) for details. ================================================ FILE: docs/data_preparation.md ================================================ # Data Preparation Compared to the data preparation procedure of StreamMapNet or MapTR, we have one more step to generate the ground-truth tracking information (Step 3). We noticed that the track generation results can be slightly different when running on different machines (likely because Shapely behaves slightly differently across machines), **so please always run Step 3 below on the training machine to generate the gt tracking information**. ## nuScenes **Step 1.** Download the [nuScenes](https://www.nuscenes.org/download) dataset to `./datasets/nuscenes`. **Step 2.** Generate annotation files for the nuScenes dataset (same as StreamMapNet) ``` python tools/data_converter/nuscenes_converter.py --data-root ./datasets/nuscenes ``` Add ``--newsplit`` to generate the metadata for the new split (geography-based split) provided by StreamMapNet. **Step 3.** Generate the tracking ground truth by ``` python tools/tracking/prepare_gt_tracks.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py --out-dir tracking_gts/nuscenes --visualize ``` Add the ``--visualize`` flag to visualize the data with element IDs derived from our track generation process, or remove it to save disk space. To generate the ground-truth tracks for the new split, change the config file accordingly. ## Argoverse2 **Step 1.** Download the [Argoverse2 (sensor)](https://argoverse.github.io/user-guide/getting_started.html#download-the-datasets) dataset to `./datasets/av2`. **Step 2.** Generate annotation files for the Argoverse2 dataset.
``` python tools/data_converter/argoverse_converter.py --data-root ./datasets/av2 ``` **Step 3.** Generate the tracking ground truth by ``` python tools/tracking/prepare_gt_tracks.py plugin/configs/maptracker/av2_oldsplit/maptracker_av2_oldsplit_5frame_span10_stage3_joint_finetune.py --out-dir tracking_gts/av2 --visualize ``` ## Checkpoints We provide the checkpoints at [this Dropbox link](https://www.dropbox.com/scl/fo/miulg8q9oby7q2x5vemme/ALoxX1HyxGlfR9y3xlqfzeE?rlkey=i3rw4mbq7lacblc7xsnjkik1u&dl=0) or [this HuggingFace repo](https://huggingface.co/cccjc/maptracker/tree/main). Please download and place them under ``./work_dirs/pretrained_ckpts``. ## File structures Make sure the final file structure looks like the one below: ``` maptracker ├── mmdetection3d ├── tools ├── plugin │ ├── configs │ ├── models │ ├── datasets │ ├── ... ├── work_dirs │ ├── pretrained_ckpts │ │ ├── maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune │ │ │ ├── latest.pth │ │ ├── ... │ ├── .... ├── datasets │ ├── nuscenes │ │ ├── maps <-- used │ │ ├── samples <-- key frames │ │ ├── v1.0-test <-- metadata │ │ ├── v1.0-trainval <-- metadata and annotations │ │ ├── nuscenes_map_infos_train_{newsplit}.pkl <-- train annotations │ │ ├── nuscenes_map_infos_train_{newsplit}_gt_tracks.pkl <-- train gt tracks │ │ ├── nuscenes_map_infos_val_{newsplit}.pkl <-- val annotations │ │ ├── nuscenes_map_infos_val_{newsplit}_gt_tracks.pkl <-- val gt tracks │ ├── av2 │ │ ├── train │ │ ├── val │ │ ├── test │ │ ├── maptrv2_val_samples_info.pkl <-- MapTR's av2 metadata, used to align the val set │ │ ├── av2_map_infos_train_{newsplit}.pkl <-- train annotations │ │ ├── av2_map_infos_train_{newsplit}_gt_tracks.pkl <-- train gt tracks │ │ ├── av2_map_infos_val_{newsplit}.pkl <-- val annotations │ │ ├── av2_map_infos_val_{newsplit}_gt_tracks.pkl <-- val gt tracks ``` ================================================ FILE: docs/getting_started.md ================================================ # Getting started with MapTracker In this document, we provide the commands for running inference/evaluation, training, and visualization. ## Inference and evaluation ### Inference and evaluate with Chamfer-based mAP Run the following command to do inference and evaluation using the pretrained checkpoints, assuming 8 GPUs are used. ``` CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash tools/dist_test.sh plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py work_dirs/pretrained_ckpts/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/latest.pth 8 --eval --eval-options save_semantic=True ``` Set ``--eval-options save_semantic=True`` to also save the semantic segmentation results of the BEV module.
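The evaluation and tracking scripts in this guide exchange results as pickle files (e.g. the ``pos_predictions.pkl`` used in the C-mAP and visualization examples below). If you want to inspect such a file programmatically, it can usually be loaded with ``mmcv``. This is only a quick-inspection sketch: the path is copied from the C-mAP example and will differ for your run, and the internal key layout of the pickle is not documented here, so the snippet only peeks at the top level.

```python
import mmcv

# Path from the C-mAP example below; substitute the file produced by your own run.
result_path = ('work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/'
               'pos_predictions.pkl')

results = mmcv.load(result_path)  # mmcv picks the pickle handler from the .pkl suffix
print(type(results))
if isinstance(results, dict):
    print(list(results.keys())[:10])       # peek at the top-level keys
elif isinstance(results, (list, tuple)):
    print(len(results), type(results[0]))  # number of entries and their type
```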
### Evaluate with C-mAP Generate the prediction matching information by ``` python tools/tracking/prepare_pred_tracks.py ${CONFIG} --result_path ${SUBMISSION_FILE} --cons_frames ${COMEBACK_FRAMES} ``` Then evaluate with C-mAP by ``` python tools/tracking/calculate_cmap.py ${CONFIG} --result_path ${PRED_MATCHING_INFO} ``` An example evaluation: ``` python tools/tracking/calculate_cmap.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py --result_path ./work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/pos_predictions.pkl ``` ### Results By running with the checkpoints we provided in the [data preparation guide](data_preparation.md), the expected results are: | Dataset | Split | Divider | Crossing | Boundary | mAP | C-mAP | |:----------:|:-----:|:-------:|:--------:|:--------:|:-----:|:-----:| | nuScenes | old | 74.14 | 80.04 | 74.06 | 76.08 | 69.13 | | nuScenes | new | 30.10 | 45.86 | 45.06 | 40.34 | 32.50 | | Argoverse2 | old | 76.99 | 79.97 | 73.66 | 76.87 | 68.35 | | Argoverse2 | new | 75.11 | 69.96 | 68.95 | 71.34 | 63.11 | ## Training The training consists of three stages as detailed in the paper. We train the models on 8 NVIDIA RTX A5000 GPUs. **Stage 1**: BEV pretraining with semantic segmentation losses: ``` bash ./tools/dist_train.sh plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage1_bev_pretrain.py 8 ``` **Stage 2**: Vector module warmup with a large batch size while freezing the BEV module: ``` bash ./tools/dist_train.sh plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage2_warmup.py 8 ``` Set ``load_from=...`` in the config file so that it loads the checkpoint from stage 1. **Stage 3**: Joint finetuning: ``` bash ./tools/dist_train.sh plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py 8 ``` Set ``load_from=...`` in the config file so that it loads the checkpoint from stage 2. ## Visualization ### Global merged reconstruction (merged from local HD maps) ```bash python tools/visualization/vis_global.py [path to method configuration file under plugin/configs] \ --data_path [path to the .pkl file] \ --out_dir [path to the output folder] \ --option [vis-pred / vis-gt: visualize predicted vectors / visualize ground truth vectors] \ --per_frame_result 1 ``` Set ``--per_frame_result`` to 1 to generate the per-frame video (this visualization is a bit slow); set it to 0 to only produce the final merged global reconstruction.
Examples: ```bash # Visualize MapTracker's prediction python tools/visualization/vis_global.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py \ --data_path work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/pos_predictions.pkl \ --out_dir vis_global/nuscenes_old/maptracker \ --option vis-pred --per_frame_result 1 # Visualize ground truth data python tools/visualization/vis_global.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py \ --data_path datasets/nuscenes/nuscenes_map_infos_val_gt_tracks.pkl \ --out_dir vis_global/nuscenes_old/gt \ --option vis-gt --per_frame_result 0 ``` ### Local HD map reconstruction ```bash python tools/visualization/vis_per_frame.py [path to method configuration file under plugin/configs] \ --data_path [path to the .pkl file] \ --out_dir [path to the output folder] \ --option [vis-pred / vis-gt: visualize predicted vectors / visualize ground truth vectors and input video streams] ``` Note that the input perspective-view videos will be saved when generating the ground truth visualization. Examples: ```bash # Visualize MapTracker's prediction python tools/visualization/vis_per_frame.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py \ --data_path work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/pos_predictions.pkl \ --out_dir vis_local/nuscenes_old/maptracker \ --option vis-pred # Visualize ground truth data python tools/visualization/vis_per_frame.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py \ --data_path datasets/nuscenes/nuscenes_map_infos_val_gt_tracks.pkl \ --out_dir vis_local/nuscenes_old/gt \ --option vis-gt ``` ================================================ FILE: docs/installation.md ================================================ # Environment Setup We use the same environment as StreamMapNet, and the setup instructions below are largely borrowed from their repo. **Step 1.** Create the conda environment and activate it: ``` conda create --name maptracker python=3.8 -y conda activate maptracker ``` **Step 2.** Install PyTorch. ``` pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html ``` **Step 3.** Install the MMCV series. ``` # Install mmcv-series pip install mmcv-full==1.6.0 pip install mmdet==2.28.2 pip install mmsegmentation==0.30.0 git clone https://github.com/open-mmlab/mmdetection3d.git cd mmdetection3d git checkout v1.0.0rc6 pip install -e . ``` **Step 4.** Install other requirements.
``` pip install -r requirements.txt ``` ================================================ FILE: plugin/__init__.py ================================================ from .models import * from .datasets import * ================================================ FILE: plugin/configs/_base_/datasets/coco_instance.py ================================================ dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline)) evaluation = dict(metric=['bbox', 'segm']) ================================================ FILE: plugin/configs/_base_/datasets/kitti-3d-3class.py ================================================ # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] point_cloud_range = [0, -40, -3, 70.4, 40, 1] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6)) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=6, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=1, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/kitti-3d-car.py ================================================ # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Car'] point_cloud_range = [0, -40, -3, 70.4, 40, 1] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15)) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=6, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=1, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/lyft-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-80, -80, -5, 80, 80, 3] # For Lyft we usually do 9-class detection class_names = [ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal' ] dataset_type = 'LyftDataset' data_root = 'data/lyft/' # Input modality for Lyft dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/lyft/': 's3://lyft/lyft/', # 'data/lyft/': 's3://lyft/lyft/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_test.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True)) # For Lyft dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. 
evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/nuim_instance.py ================================================ dataset_type = 'CocoDataset' data_root = 'data/nuimages/' class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict( type='Resize', img_scale=[(1280, 720), (1920, 1080)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1600, 900), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-train.json', img_prefix=data_root, classes=class_names, pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-val.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-val.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline)) evaluation = dict(metric=['bbox', 'segm']) ================================================ FILE: plugin/configs/_base_/datasets/nus-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-50, -50, -5, 50, 50, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' # Input modality for nuScenes dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/nuscenes/': 's3://nuscenes/nuscenes/', # 'data/nuscenes/': 's3://nuscenes/nuscenes/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. 
evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/nus-mono3d.py ================================================ dataset_type = 'NuScenesMonoDataset' data_root = 'data/nuscenes/' class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] # Input modality for nuScenes dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='LoadAnnotations3D', with_bbox=True, with_label=True, with_attr_label=True, with_bbox_3d=True, with_label_3d=True, with_bbox_depth=True), dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=[ 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', 'gt_labels_3d', 'centers2d', 'depths' ]), ] test_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='MultiScaleFlipAug', scale_factor=1.0, flip=False, transforms=[ dict(type='RandomFlip3D'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img']), ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json', img_prefix=data_root, classes=class_names, pipeline=train_pipeline, modality=input_modality, test_mode=False, box_type_3d='Camera'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline, modality=input_modality, test_mode=True, box_type_3d='Camera'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline, modality=input_modality, test_mode=True, box_type_3d='Camera')) evaluation = dict(interval=2) ================================================ FILE: plugin/configs/_base_/datasets/range100_lyft-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-100, -100, -5, 100, 100, 3] # For Lyft we usually do 9-class detection class_names = [ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal' ] dataset_type = 'LyftDataset' data_root = 'data/lyft/' # Input modality for Lyft dataset, this is consistent with the submission # format which requires the information in input_modality. 
input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/lyft/': 's3://lyft/lyft/', # 'data/lyft/': 's3://lyft/lyft/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_test.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True)) # For Lyft dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. 
evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/s3dis_seg-3d-13class.py ================================================ # dataset settings dataset_type = 'S3DISSegDataset' data_root = './data/s3dis/' class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') num_points = 4096 train_area = [1, 2, 3, 4, 6] test_area = 5 train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=tuple(range(len(class_names))), max_cat_id=13), dict( type='IndoorPatchPointSample', num_points=num_points, block_size=1.0, ignore_index=len(class_names), use_normalized_coord=True, enlarge_size=0.2, min_unique_num=None), dict(type='NormalizePointsColor', color_mean=None), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict(type='NormalizePointsColor', color_mean=None), dict( # a wrapper in order to successfully call test function # actually we don't perform test-time-aug type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.0, flip_ratio_bev_vertical=0.0), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) # we need to load gt seg_mask! 
eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=tuple(range(len(class_names))), max_cat_id=13), dict( type='DefaultFormatBundle3D', with_label=False, class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, # train on area 1, 2, 3, 4, 6 # test on area 5 train=dict( type=dataset_type, data_root=data_root, ann_files=[ data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area ], pipeline=train_pipeline, classes=class_names, test_mode=False, ignore_index=len(class_names), scene_idxs=[ data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area ]), val=dict( type=dataset_type, data_root=data_root, ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names), scene_idxs=data_root + f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'), test=dict( type=dataset_type, data_root=data_root, ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names))) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/scannet-3d-18class.py ================================================ # dataset settings dataset_type = 'ScanNetDataset' data_root = './data/scannet/' class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin') train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_mask_3d=True, with_seg_3d=True), dict(type='GlobalAlignment', rotation_axis=2), dict( type='PointSegClassMapping', valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39), max_cat_id=40), dict(type='IndoorPointSample', num_points=40000), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.087266, 0.087266], scale_ratio_range=[1.0, 1.0], shift_height=True), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=[ 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', 'pts_instance_mask' ]) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict(type='GlobalAlignment', rotation_axis=2), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='IndoorPointSample', num_points=40000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2]), dict(type='GlobalAlignment', rotation_axis=2), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_train.pkl', pipeline=train_pipeline, filter_empty_gt=False, classes=class_names, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='Depth')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth')) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/scannet_seg-3d-20class.py ================================================ # dataset settings dataset_type = 'ScanNetSegDataset' data_root = './data/scannet/' class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'otherfurniture') num_points = 8192 train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39), max_cat_id=40), dict( type='IndoorPatchPointSample', num_points=num_points, block_size=1.5, ignore_index=len(class_names), use_normalized_coord=False, enlarge_size=0.2, min_unique_num=None), dict(type='NormalizePointsColor', color_mean=None), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict(type='NormalizePointsColor', color_mean=None), dict( # a wrapper in order to successfully call test function # actually we don't perform test-time-aug type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.0, flip_ratio_bev_vertical=0.0), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) # we need to load gt seg_mask! 
eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39), max_cat_id=40), dict( type='DefaultFormatBundle3D', with_label=False, class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_train.pkl', pipeline=train_pipeline, classes=class_names, test_mode=False, ignore_index=len(class_names), scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names)), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names))) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/sunrgbd-3d-10class.py ================================================ dataset_type = 'SUNRGBDDataset' data_root = 'data/sunrgbd/' class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', 'night_stand', 'bookshelf', 'bathtub') train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict(type='LoadAnnotations3D'), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict( type='GlobalRotScaleTrans', rot_range=[-0.523599, 0.523599], scale_ratio_range=[0.85, 1.15], shift_height=True), dict(type='IndoorPointSample', num_points=20000), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict(type='IndoorPointSample', num_points=20000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2]), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=16, workers_per_gpu=4, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_train.pkl', pipeline=train_pipeline, classes=class_names, filter_empty_gt=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='Depth')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth')) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/waymoD5-3d-3class.py ================================================ # dataset settings # D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://waymo_data/')) class_names = ['Car', 'Pedestrian', 'Cyclist'] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_train.pkl', split='training', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', # load one frame every five frames load_interval=5)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/waymoD5-3d-car.py ================================================ # dataset settings # D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://waymo_data/')) class_names = ['Car'] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_train.pkl', split='training', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR', # load one frame every five frames load_interval=5)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/default_runtime.py ================================================ checkpoint_config = dict(interval=1) # yapf:disable push # By default we use textlogger hook and tensorboard # For more loggers see # https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: plugin/configs/_base_/models/3dssd.py ================================================ model = dict( type='SSD3DNet', backbone=dict( type='PointNet2SAMSG', in_channels=4, num_points=(4096, 512, (256, 256)), radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 64, 128), (64, 96, 128)), ((128, 128, 256), (128, 192, 256), (128, 256, 256))), aggregation_channels=(64, 128, 256), fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), fps_sample_range_lists=((-1), (-1), (512, -1)), norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False)), bbox_head=dict( type='SSD3DHead', in_channels=256, vote_module_cfg=dict( in_channels=256, num_points=256, gt_per_seed=1, conv_channels=(128, ), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), with_res_feat=False, vote_xyz_range=(3.0, 3.0, 2.0)), vote_aggregation_cfg=dict( type='PointSAModuleMSG', num_point=256, radii=(4.8, 6.4), sample_nums=(16, 32), mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)), norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), use_xyz=True, normalize_xyz=False, bias=True), pred_layer_cfg=dict( in_channels=1536, shared_conv_channels=(512, 128), cls_conv_channels=(128, ), reg_conv_channels=(128, ), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), objectness_loss=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0), center_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), corner_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict( sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05), test_cfg=dict( nms_cfg=dict(type='nms', 
iou_thr=0.1), sample_mod='spec', score_thr=0.0, per_class_proposal=True, max_output_num=100)) ================================================ FILE: plugin/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py ================================================ # model settings model = dict( type='CascadeRCNN', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), roi_head=dict( type='CascadeRoIHead', num_stages=3, stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=80, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), 
mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ]), test_cfg=dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) ================================================ FILE: plugin/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py ================================================ voxel_size = [0.1, 0.1, 0.2] model = dict( type='CenterPoint', pts_voxel_layer=dict( max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)), pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), pts_middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[41, 1024, 1024], output_channels=128, order=('conv', 'norm', 'act'), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='CenterHead', in_channels=sum([256, 256]), tasks=[ dict(num_class=1, class_names=['car']), dict(num_class=2, class_names=['truck', 'construction_vehicle']), dict(num_class=2, class_names=['bus', 'trailer']), dict(num_class=1, class_names=['barrier']), dict(num_class=2, class_names=['motorcycle', 'bicycle']), dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), ], common_heads=dict( reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), share_conv_channel=64, bbox_coder=dict( type='CenterPointBBoxCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_num=500, score_threshold=0.1, out_size_factor=8, voxel_size=voxel_size[:2], code_size=9), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), norm_bbox=True), # model training and testing settings train_cfg=dict( pts=dict( grid_size=[1024, 1024, 40], voxel_size=voxel_size, out_size_factor=8, dense_reg=1, gaussian_overlap=0.1, max_objs=500, min_radius=2, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), test_cfg=dict( pts=dict( post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_per_img=500, max_pool_nms=False, min_radius=[4, 12, 10, 1, 0.85, 0.175], score_threshold=0.1, out_size_factor=8, voxel_size=voxel_size[:2], nms_type='rotate', pre_max_size=1000, post_max_size=83, nms_thr=0.2))) ================================================ FILE: 
plugin/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py ================================================ voxel_size = [0.2, 0.2, 8] model = dict( type='CenterPoint', pts_voxel_layer=dict( max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)), pts_voxel_encoder=dict( type='PillarFeatureNet', in_channels=5, feat_channels=[64], with_distance=False, voxel_size=(0.2, 0.2, 8), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), legacy=False), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), pts_backbone=dict( type='SECOND', in_channels=64, out_channels=[64, 128, 256], layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], out_channels=[128, 128, 128], upsample_strides=[0.5, 1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='CenterHead', in_channels=sum([128, 128, 128]), tasks=[ dict(num_class=1, class_names=['car']), dict(num_class=2, class_names=['truck', 'construction_vehicle']), dict(num_class=2, class_names=['bus', 'trailer']), dict(num_class=1, class_names=['barrier']), dict(num_class=2, class_names=['motorcycle', 'bicycle']), dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), ], common_heads=dict( reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), share_conv_channel=64, bbox_coder=dict( type='CenterPointBBoxCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_num=500, score_threshold=0.1, out_size_factor=4, voxel_size=voxel_size[:2], code_size=9), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), norm_bbox=True), # model training and testing settings train_cfg=dict( pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, out_size_factor=4, dense_reg=1, gaussian_overlap=0.1, max_objs=500, min_radius=2, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), test_cfg=dict( pts=dict( post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_per_img=500, max_pool_nms=False, min_radius=[4, 12, 10, 1, 0.85, 0.175], score_threshold=0.1, pc_range=[-51.2, -51.2], out_size_factor=4, voxel_size=voxel_size[:2], nms_type='rotate', pre_max_size=1000, post_max_size=83, nms_thr=0.2))) ================================================ FILE: plugin/configs/_base_/models/fcos3d.py ================================================ model = dict( type='FCOSMono3D', pretrained='open-mmlab://detectron2/resnet101_caffe', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs='on_output', num_outs=5, relu_before_extra_convs=True), bbox_head=dict( type='FCOSMono3DHead', num_classes=10, in_channels=256, stacked_convs=2, feat_channels=256, use_direction_classifier=True, diff_rad_by_sin=True, pred_attrs=True, pred_velo=True, dir_offset=0.7854, # pi/4 strides=[8, 16, 32, 64, 128], group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo cls_branch=(256, ), reg_branch=( (256, ), # offset (256, ), # depth (256, ), # size (256, ), # rot () # velo ), 
dir_branch=(256, ), attr_branch=(256, ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_attr=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_centerness=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), norm_on_bbox=True, centerness_on_reg=True, center_sampling=True, conv_bias=True, dcn_on_last_conv=True), train_cfg=dict( allowed_border=0, code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05], pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=1000, nms_thr=0.8, score_thr=0.05, min_bbox_size=0, max_per_img=200)) ================================================ FILE: plugin/configs/_base_/models/groupfree3d.py ================================================ model = dict( type='GroupFree3DNet', backbone=dict( type='PointNet2SASSG', in_channels=3, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 288)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True)), bbox_head=dict( type='GroupFree3DHead', in_channels=288, num_decoder_layers=6, num_proposal=256, transformerlayers=dict( type='BaseTransformerLayer', attn_cfgs=dict( type='GroupFree3DMHA', embed_dims=288, num_heads=8, attn_drop=0.1, dropout_layer=dict(type='Dropout', drop_prob=0.1)), ffn_cfgs=dict( embed_dims=288, feedforward_channels=2048, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True)), operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')), pred_layer_cfg=dict( in_channels=288, shared_conv_channels=(288, 288), bias=True), sampling_objectness_loss=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=8.0), objectness_loss=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), center_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict(sample_mod='kps'), test_cfg=dict( sample_mod='kps', nms_thr=0.25, score_thr=0.0, per_class_proposal=True, prediction_stages='last')) ================================================ FILE: plugin/configs/_base_/models/h3dnet.py ================================================ primitive_z_cfg = dict( type='PrimitiveHead', num_dims=2, num_classes=18, primitive_mode='z', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), 
conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) primitive_xy_cfg = dict( type='PrimitiveHead', num_dims=1, num_classes=18, primitive_mode='xy', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) primitive_line_cfg = dict( type='PrimitiveHead', num_dims=0, num_classes=18, primitive_mode='line', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=2.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) model = dict( type='H3DNet', backbone=dict( type='MultiBackbone', num_streams=4, suffixes=['net0', 'net1', 'net2', 'net3'], conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), act_cfg=dict(type='ReLU'), backbones=dict( type='PointNet2SASSG', in_channels=4, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( 
type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True))), rpn_head=dict( type='VoteHead', vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), pred_layer_cfg=dict( in_channels=128, shared_conv_channels=(128, 128), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), roi_head=dict( type='H3DRoIHead', primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg], bbox_head=dict( type='H3DBboxHead', gt_per_seed=3, num_proposal=256, suface_matching_cfg=dict( type='PointSAModule', num_point=256 * 6, radius=0.5, num_sample=32, mlp_channels=[128 + 6, 128, 64, 32], use_xyz=True, normalize_xyz=True), line_matching_cfg=dict( type='PointSAModule', num_point=256 * 12, radius=0.5, num_sample=32, mlp_channels=[128 + 12, 128, 64, 32], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), primitive_refine_channels=[128, 128, 128], upper_thresh=100.0, surface_thresh=0.5, line_thresh=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), cues_objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.3, 0.7], reduction='mean', loss_weight=5.0), cues_semantic_loss=dict( type='CrossEntropyLoss', class_weight=[0.3, 0.7], reduction='mean', loss_weight=5.0), proposal_objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='none', loss_weight=5.0), primitive_center_loss=dict( type='MSELoss', reduction='none', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), rpn_proposal=dict(use_nms=False), rcnn=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote', far_threshold=0.6, near_threshold=0.3, mask_surface_threshold=0.3, label_surface_threshold=0.3, mask_line_threshold=0.3, label_line_threshold=0.3)), test_cfg=dict( rpn=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True, use_nms=False), rcnn=dict( sample_mod='seed', 
nms_thr=0.25, score_thr=0.05, per_class_proposal=True))) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_fpn_lyft.py ================================================ _base_ = './hv_pointpillars_fpn_nus.py' # model settings (based on nuScenes model settings) # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. model = dict( pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-80, -80, -5, 80, 80, 3], max_voxels=(60000, 60000)), pts_voxel_encoder=dict( feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]), pts_middle_encoder=dict(output_shape=[640, 640]), pts_bbox_head=dict( num_classes=9, anchor_generator=dict( ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]), bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), # model training settings (based on nuScenes model settings) train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_fpn_nus.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. voxel_size = [0.25, 0.25, 8] model = dict( type='MVXFasterRCNN', pts_voxel_layer=dict( max_num_points=64, point_cloud_range=[-50, -50, -5, 50, 50, 3], voxel_size=voxel_size, max_voxels=(30000, 40000)), pts_voxel_encoder=dict( type='HardVFE', in_channels=4, feat_channels=[64, 64], with_distance=False, voxel_size=voxel_size, with_cluster_center=True, with_voxel_center=True, point_cloud_range=[-50, -50, -5, 50, 50, 3], norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]), pts_backbone=dict( type='SECOND', in_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], out_channels=[64, 128, 256]), pts_neck=dict( type='FPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), act_cfg=dict(type='ReLU'), in_channels=[64, 128, 256], out_channels=256, start_level=0, num_outs=3), pts_bbox_head=dict( type='Anchor3DHead', num_classes=10, in_channels=256, feat_channels=256, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-50, -50, -1.8, 50, 50, -1.8]], scales=[1, 2, 4], sizes=[ [0.8660, 2.5981, 1.], # 1.5/sqrt(3) [0.5774, 1.7321, 1.], # 1/sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], custom_values=[0, 0], rotations=[0, 1.57], reshape_out=True), assigner_per_size=False, diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict( assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], pos_weight=-1, 
debug=False)), test_cfg=dict( pts=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=1000, nms_thr=0.2, score_thr=0.05, min_bbox_size=0, max_num=500))) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py ================================================ _base_ = './hv_pointpillars_fpn_nus.py' # model settings (based on nuScenes model settings) # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. model = dict( pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-100, -100, -5, 100, 100, 3], max_voxels=(60000, 60000)), pts_voxel_encoder=dict( feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), pts_middle_encoder=dict(output_shape=[800, 800]), pts_bbox_head=dict( num_classes=9, anchor_generator=dict( ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]), bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), # model training settings (based on nuScenes model settings) train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_secfpn_kitti.py ================================================ voxel_size = [0.16, 0.16, 4] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=32, # max_points_per_voxel point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], voxel_size=voxel_size, max_voxels=(16000, 40000) # (training, testing) max_voxels ), voxel_encoder=dict( type='PillarFeatureNet', in_channels=4, feat_channels=[64], with_distance=False, voxel_size=voxel_size, point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]), middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), backbone=dict( type='SECOND', in_channels=64, layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], out_channels=[64, 128, 256]), neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -39.68, -0.6, 70.4, 39.68, -0.6], [0, -39.68, -0.6, 70.4, 39.68, -0.6], [0, -39.68, -1.78, 70.4, 39.68, -1.78], ], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, 
score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_secfpn_waymo.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. voxel_size = [0.32, 0.32, 6] model = dict( type='MVXFasterRCNN', pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], voxel_size=voxel_size, max_voxels=(32000, 32000)), pts_voxel_encoder=dict( type='HardVFE', in_channels=5, feat_channels=[64], with_distance=False, voxel_size=voxel_size, with_cluster_center=True, with_voxel_center=True, point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]), pts_backbone=dict( type='SECOND', in_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[3, 5, 5], layer_strides=[1, 2, 2], out_channels=[64, 128, 256]), pts_neck=dict( type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), pts_bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], [-74.88, -74.88, 0, 74.88, 74.88, 0]], sizes=[ [2.08, 4.73, 1.77], # car [0.84, 1.81, 1.77], # cyclist [0.84, 0.91, 1.74] # pedestrian ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict( assigner=[ dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False)), test_cfg=dict( pts=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=4096, nms_thr=0.25, score_thr=0.1, min_bbox_size=0, max_num=500))) ================================================ FILE: plugin/configs/_base_/models/hv_second_secfpn_kitti.py ================================================ voxel_size = [0.05, 0.05, 0.1] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=5, point_cloud_range=[0, -40, -3, 70.4, 40, 1], voxel_size=voxel_size, max_voxels=(16000, 40000)), voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseEncoder', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 
'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) ================================================ FILE: plugin/configs/_base_/models/hv_second_secfpn_waymo.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. 
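# For reference, a consistency check using only the values defined below: the
# point cloud range [-76.8, -51.2, -2, 76.8, 51.2, 4] spans 153.6 m x 102.4 m x 6 m,
# so with voxel_size=[0.08, 0.08, 0.1] the grid is 153.6/0.08 = 1920 voxels in x,
# 102.4/0.08 = 1280 in y and 6/0.1 = 60 in z. The middle encoder's
# sparse_shape=[61, 1280, 1920] lists these as [z, y, x], with one extra voxel
# along z as in the other SparseEncoder configs here (e.g. 41 = 40 + 1 for the
# KITTI range [0, -40, -3, 70.4, 40, 1] with 0.1 m z-voxels).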
voxel_size = [0.08, 0.08, 0.1] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=10, point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4], voxel_size=voxel_size, max_voxels=(80000, 90000)), voxel_encoder=dict(type='HardSimpleVFE', num_features=5), middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[61, 1280, 1920], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=384, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345], [-76.8, -51.2, 0, 76.8, 51.2, 0], [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], sizes=[ [2.08, 4.73, 1.77], # car [0.84, 0.91, 1.74], # pedestrian [0.84, 1.81, 1.77] # cyclist ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), dict( # cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1) ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=4096, nms_thr=0.25, score_thr=0.1, min_bbox_size=0, max_num=500)) ================================================ FILE: plugin/configs/_base_/models/imvotenet_image.py ================================================ model = dict( type='ImVoteNet', img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe'), img_neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), img_rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), img_roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( 
type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0))), # model training and testing settings train_cfg=dict( img_rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), img_rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), img_rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)), test_cfg=dict( img_rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), img_rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))) ================================================ FILE: plugin/configs/_base_/models/mask_rcnn_r50_fpn.py ================================================ # model settings model = dict( type='MaskRCNN', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=80, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=1000, max_num=1000, 
nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)), test_cfg=dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) ================================================ FILE: plugin/configs/_base_/models/paconv_cuda_ssg.py ================================================ _base_ = './paconv_ssg.py' model = dict( backbone=dict( sa_cfg=dict( type='PAConvCUDASAModule', scorenet_cfg=dict(mlp_channels=[8, 16, 16])))) ================================================ FILE: plugin/configs/_base_/models/paconv_ssg.py ================================================ # model settings model = dict( type='EncoderDecoder3D', backbone=dict( type='PointNet2SASSG', in_channels=9, # [xyz, rgb, normalized_xyz] num_points=(1024, 256, 64, 16), radius=(None, None, None, None), # use kNN instead of ball query num_samples=(32, 32, 32, 32), sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, 512)), fp_channels=(), norm_cfg=dict(type='BN2d', momentum=0.1), sa_cfg=dict( type='PAConvSAModule', pool_mod='max', use_xyz=True, normalize_xyz=False, paconv_num_kernels=[16, 16, 16], paconv_kernel_input='w_neighbor', scorenet_input='w_neighbor_dist', scorenet_cfg=dict( mlp_channels=[16, 16, 16], score_norm='softmax', temp_factor=1.0, last_bn=False))), decode_head=dict( type='PAConvHead', # PAConv model's decoder takes skip connections from beckbone # different from PointNet++, it also concats input features in the last # level of decoder, leading to `128 + 6` as the channel number fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), (128 + 6, 128, 128, 128)), channels=128, dropout_ratio=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, class_weight=None, # should be modified with dataset loss_weight=1.0)), # correlation loss to regularize PAConv's kernel weights loss_regularization=dict( type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide')) ================================================ FILE: plugin/configs/_base_/models/parta2.py ================================================ # model settings voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] model = dict( type='PartA2', voxel_layer=dict( max_num_points=5, # max_points_per_voxel point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(16000, 40000) # (training, testing) max_voxels ), voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseUNet', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), rpn_head=dict( type='PartA2RPNHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, 
-40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78]], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, assigner_per_size=True, assign_per_class=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), roi_head=dict( type='PartAggregationROIHead', num_classes=3, semantic_head=dict( type='PointwiseSemanticHead', in_channels=16, extra_width=0.2, seg_score_thr=0.3, num_classes=3, loss_seg=dict( type='FocalLoss', use_sigmoid=True, reduction='sum', gamma=2.0, alpha=0.25, loss_weight=1.0), loss_part=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), seg_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='max')), part_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='avg')), bbox_head=dict( type='PartA2BboxHead', num_classes=3, seg_in_channels=16, part_in_channels=4, seg_conv_channels=[64, 64], part_conv_channels=[64, 64], merge_conv_channels=[128, 128], down_conv_channels=[128, 256], bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), shared_fc_channels=[256, 512, 512, 512], cls_channels=[256, 256], reg_channels=[256, 256], dropout_ratio=0.1, roi_feat_size=14, with_corner_loss=True, loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, reduction='sum', loss_weight=1.0), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1) ], allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=9000, nms_post=512, max_num=512, nms_thr=0.8, score_thr=0, use_rotate_nms=False), rcnn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1) ], sampler=dict( type='IoUNegPiecewiseSampler', num=128, pos_fraction=0.55, neg_piece_fractions=[0.8, 0.2], neg_iou_piece_thrs=[0.55, 0.1], neg_pos_ub=-1, add_gt_as_proposals=False, return_iou=True), cls_pos_thr=0.75, cls_neg_thr=0.25)), test_cfg=dict( rpn=dict( nms_pre=1024, nms_post=100, max_num=100, nms_thr=0.7, score_thr=0, use_rotate_nms=True), rcnn=dict( use_rotate_nms=True, use_raw_score=True, nms_thr=0.01, score_thr=0.1))) 
================================================ FILE: plugin/configs/_base_/models/pointnet2_msg.py ================================================ _base_ = './pointnet2_ssg.py' # model settings model = dict( backbone=dict( _delete_=True, type='PointNet2SAMSG', in_channels=6, # [xyz, rgb], should be modified with dataset num_points=(1024, 256, 64, 16), radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)), num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, 128)), ((128, 196, 256), (128, 196, 256)), ((256, 256, 512), (256, 384, 512))), aggregation_channels=(None, None, None, None), fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), fps_sample_range_lists=((-1), (-1), (-1), (-1)), dilated_group=(False, False, False, False), out_indices=(0, 1, 2, 3), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False)), decode_head=dict( fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128), (128, 128, 128, 128)))) ================================================ FILE: plugin/configs/_base_/models/pointnet2_ssg.py ================================================ # model settings model = dict( type='EncoderDecoder3D', backbone=dict( type='PointNet2SASSG', in_channels=6, # [xyz, rgb], should be modified with dataset num_points=(1024, 256, 64, 16), radius=(0.1, 0.2, 0.4, 0.8), num_samples=(32, 32, 32, 32), sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, 512)), fp_channels=(), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=False)), decode_head=dict( type='PointNet2Head', fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), (128, 128, 128, 128)), channels=128, dropout_ratio=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, class_weight=None, # should be modified with dataset loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide')) ================================================ FILE: plugin/configs/_base_/models/votenet.py ================================================ model = dict( type='VoteNet', backbone=dict( type='PointNet2SASSG', in_channels=4, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True)), bbox_head=dict( type='VoteHead', vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), pred_layer_cfg=dict( in_channels=128, shared_conv_channels=(128, 128), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', 
loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), test_cfg=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)) ================================================ FILE: plugin/configs/_base_/schedules/cosine.py ================================================ # This schedule is mainly used by models with dynamic voxelization # optimizer lr = 0.003 # max learning rate optimizer = dict( type='AdamW', lr=lr, betas=(0.95, 0.99), # the momentum is changed during training weight_decay=0.001) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 10, min_lr_ratio=1e-5) momentum_config = None runner = dict(type='EpochBasedRunner', max_epochs=40) ================================================ FILE: plugin/configs/_base_/schedules/cyclic_20e.py ================================================ # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained for 24 epochs by default, we set evaluation # interval to be 20. Please change the interval accordingly if you do not # use a default schedule. # optimizer # This schedule is mainly used by models on nuScenes dataset optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01) # max_norm=10 is better for SECOND optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4, ) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4, ) # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=20) ================================================ FILE: plugin/configs/_base_/schedules/cyclic_40e.py ================================================ # The schedule is usually used by models trained on KITTI dataset # The learning rate set in the cyclic schedule is the initial learning rate # rather than the max learning rate. Since the target_ratio is (10, 1e-4), # the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4 lr = 0.0018 # The optimizer follows the setting in SECOND.Pytorch, but here we use # the official AdamW optimizer implemented by PyTorch.
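# Worked example of the schedule described above, using only values from this
# file: the base lr is 0.0018, so with target_ratio=(10, 1e-4) the learning rate
# ramps up to 0.0018 * 10 = 0.018 during the first 40% of training
# (step_ratio_up=0.4) and then anneals down to 0.0018 * 1e-4 = 1.8e-7 over the
# remaining 60%.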
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) # We use cyclic learning rate and momentum schedule following SECOND.Pytorch # https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa # We implement them in mmcv, for more details, please refer to # https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa # https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4, ) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4, ) # Although the max_epochs is 40, this schedule is usually used together with # RepeatDataset with repeat ratio N, thus the actual max epoch # number could be Nx40 runner = dict(type='EpochBasedRunner', max_epochs=40) ================================================ FILE: plugin/configs/_base_/schedules/mmdet_schedule_1x.py ================================================ # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=0.001, step=[8, 11]) runner = dict(type='EpochBasedRunner', max_epochs=12) ================================================ FILE: plugin/configs/_base_/schedules/schedule_2x.py ================================================ # optimizer # This schedule is mainly used by models on nuScenes dataset optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) # max_norm=10 is better for SECOND optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 1000, step=[20, 23]) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=24) ================================================ FILE: plugin/configs/_base_/schedules/schedule_3x.py ================================================ # optimizer # This schedule is mainly used by models on indoor datasets, # e.g., VoteNet on SUNRGBD and ScanNet lr = 0.008 # max learning rate optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) lr_config = dict(policy='step', warmup=None, step=[24, 32]) # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=36) ================================================ FILE: plugin/configs/_base_/schedules/seg_cosine_150e.py ================================================ # optimizer # This schedule is mainly used on S3DIS dataset in segmentation task optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9) optimizer_config = dict(grad_clip=None) lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=0.002) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=150) ================================================ FILE: plugin/configs/_base_/schedules/seg_cosine_200e.py ================================================ # optimizer # This schedule is mainly used on ScanNet dataset in segmentation task optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01) optimizer_config = dict(grad_clip=None)
lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=200) ================================================ FILE: plugin/configs/_base_/schedules/seg_cosine_50e.py ================================================ # optimizer # This schedule is mainly used on S3DIS dataset in segmentation task optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001) optimizer_config = dict(grad_clip=None) lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=50) ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_100x50_newsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 1 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 12 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (100, 50) # bev range, 100m in x-axis, 50m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, 
num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), 
dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
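# For reference, the numbers implied by the settings above: with num_gpus=8 and
# batch_size=1, num_iters_per_epoch = 29293 // 8 = 3661, total_iters =
# 12 * 3661 = 43932, and the evaluation hook above (and the checkpoint hook
# below) fire every num_epochs_interval * num_iters_per_epoch = 2 * 3661 = 7322
# iterations. The paramwise_cfg above also scales the image backbone's learning
# rate to 5e-4 * 0.1 = 5e-5, while the rest of the model trains at 5e-4.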
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_100x50_newsplit_5frame_span10_stage2_warmup.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 6 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 3 num_epochs_interval = num_epochs total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (100, 50) # bev range, 100m in x-axis, 50m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=True, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, 
operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='PV_Map', img_shape=img_size, feat_down_sample=8, thickness=1, coords_dim=coords_dim, pv_mask=True, num_cams=num_cams, num_coords=3, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask', 'pv_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 
'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
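# NOTE (warmup schedule): with num_gpus=8 and batch_size=6,
#   num_iters_per_epoch = 29293 // (8 * 6) = 610
#   total_iters         = 3 * 610          = 1830
# so this warmup stage is short, and min_lr_ratio=0.95 keeps the learning rate nearly
# constant over it. freeze_bev=True (in the model config above) freezes the BEV backbone
# trained in stage 1, whose weights are loaded via load_from at the end of this file;
# mem_warmup_iters=500 presumably delays reliance on the memory for the first 500 iterations.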
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_100x50_newsplit_5frame_span10_stage1_bev_pretrain/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_100x50_newsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 2 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 20 num_epochs_interval = num_epochs // 5 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (100, 50) # bev range, 100m in x-axis, 50m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), 
embed_dims=bev_embed_dims, num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 
'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
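# NOTE (joint finetuning): unlike stages 1-2, the paramwise_cfg above assigns reduced
# learning-rate multipliers to the modules that were already trained earlier
# (image backbone x0.1; neck, BEV transformer, positional encoding and segmentation
# decoder x0.5), while the remaining vector/tracking components keep the full base
# lr of 5e-4. With num_epochs_interval = 20 // 5 = 4, evaluation and checkpointing
# run every 4 epochs' worth of iterations.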
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_100x50_newsplit_5frame_span10_stage2_warmup/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_newsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 1 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 12 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, 
num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 
'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
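# NOTE: this is the standard-range (60 m x 30 m) counterpart of the 100x50 configs above.
# In this BEV-pretraining stage the vector head is skipped and the memory is disabled
# (skip_vector_head=True, use_memory=False in the model config), so training effectively
# supervises only the BEV features and segmentation head; accordingly the eval pipeline
# keeps RasterizeMap and sets eval_semantic=True so the semantic masks can be scored.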
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_newsplit_5frame_span10_stage2_warmup.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 6 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 3 num_epochs_interval = num_epochs total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=True, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 
'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='PV_Map', img_shape=img_size, feat_down_sample=8, thickness=1, coords_dim=coords_dim, pv_mask=True, num_cams=num_cams, num_coords=3, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask', 'pv_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 
'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
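# NOTE (auxiliary PV supervision): the warmup train_pipeline above inserts a PV_Map step
# (feat_down_sample=8, thickness=1, num_cams=7, num_coords=3) which, judging from its
# parameters, rasterizes the map elements into each camera's perspective view at 1/8 of
# the input resolution and adds the result as the 'pv_mask' target collected by Collect3D.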
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_newsplit_5frame_span10_stage1_bev_pretrain/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_newsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 2 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 20 num_epochs_interval = num_epochs // 5 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, 
num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 
'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
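# NOTE (memory settings): compared with the warmup stage, mem_warmup_iters is -1 here,
# i.e. the memory is used from the first iteration of joint finetuning. The other
# memory-related fields (use_memory=True, mem_len=4, history_steps=4,
# test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15]) appear to control
# how many past frames are buffered during training vs. testing and at which travelled
# distances memory entries are selected.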
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_newsplit_5frame_span10_stage2_warmup/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_oldsplit/maptracker_av2_oldsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 1 num_iters_per_epoch = 27243 // (num_gpus * batch_size) num_epochs = 12 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, num_cams=num_cams, ), ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for 
evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
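# NOTE: the oldsplit configs differ from the newsplit ones in the annotation files
# (av2_map_infos_train.pkl / av2_map_infos_val.pkl instead of the *_newsplit.pkl files)
# and in the training-set size used for the schedule (27243 samples here vs. 29293 in
# the new split); otherwise this file follows the same stage-1 BEV-pretraining recipe
# as the newsplit configs above.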
find_unused_parameters = True  # NOTE: when using gradient checkpointing (with_cp=True), find_unused_parameters must be False
checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch)

runner = dict(
    type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch)

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

SyncBN = True

================================================
FILE: plugin/configs/maptracker/av2_oldsplit/maptracker_av2_oldsplit_5frame_span10_stage2_warmup.py
================================================
_base_ = [
    '../../_base_/default_runtime.py'
]

# model type
type = 'Mapper'
plugin = True

# plugin code dir
plugin_dir = 'plugin/'

# img configs
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)

img_h = 608
img_w = 608
img_size = (img_h, img_w)

num_cams = 7

num_gpus = 8
batch_size = 6
num_iters_per_epoch = 27243 // (num_gpus * batch_size)
num_epochs = 3
num_epochs_interval = num_epochs
total_iters = num_epochs * num_iters_per_epoch

num_queries = 100

# category configs
cat2id = {
    'ped_crossing': 0,
    'divider': 1,
    'boundary': 2,
}
num_class = max(list(cat2id.values())) + 1

# bev configs
roi_size = (60, 30)  # bev range, 60m in x-axis, 30m in y-axis
bev_h = 50
bev_w = 100
pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5]

# vectorize params
coords_dim = 2
sample_dist = -1
sample_num = -1
simplify = True

# rasterize params (for temporal matching use)
canvas_size = (200, 100)  # bev feature size
thickness = 3  # thickness of rasterized polylines

# meta info for submission pkl
meta = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
    output_format='vector')

# model configs
bev_embed_dims = 256
embed_dims = 512
num_feat_levels = 3
norm_cfg = dict(type='BN2d')
num_class = max(list(cat2id.values()))+1
num_points = 20
permute = True

model = dict(
    type='MapTracker',
    roi_size=roi_size,
    bev_h=bev_h,
    bev_w=bev_w,
    history_steps=4,
    test_time_history_steps=20,
    mem_select_dist_ranges=[1, 5, 10, 15],
    skip_vector_head=False,
    freeze_bev=True,
    track_fp_aug=False,
    use_memory=True,
    mem_len=4,
    mem_warmup_iters=500,
    backbone_cfg=dict(
        type='BEVFormerBackbone',
        roi_size=roi_size,
        bev_h=bev_h,
        bev_w=bev_w,
        use_grid_mask=True,
        history_steps=4,
        img_backbone=dict(
            type='ResNet',
            with_cp=False,
            # pretrained='./resnet50_checkpoint.pth',
            pretrained='open-mmlab://detectron2/resnet50_caffe',
            # pretrained='torchvision://resnet18',
            depth=50,
            # depth=18,
            num_stages=4,
            out_indices=(1, 2, 3),
            frozen_stages=-1,
            norm_cfg=norm_cfg,
            norm_eval=True,
            style='caffe',
            dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
            stage_with_dcn=(False, False, True, True)
        ),
        img_neck=dict(
            type='FPN',
            in_channels=[512, 1024, 2048],
            # in_channels=[128, 256, 512],
            out_channels=bev_embed_dims,
            start_level=0,
            add_extra_convs=True,
            num_outs=num_feat_levels,
            norm_cfg=norm_cfg,
            relu_before_extra_convs=True),
        transformer=dict(
            type='PerceptionTransformer',
            embed_dims=bev_embed_dims,
            num_cams=num_cams,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=2,
                pc_range=pc_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=bev_embed_dims,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=bev_embed_dims,
                                num_points=8,
                                num_levels=num_feat_levels),
                            embed_dims=bev_embed_dims,
                            num_cams=num_cams,
                        ),
                    ],
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='PV_Map', img_shape=img_size, feat_down_sample=8, thickness=1, coords_dim=coords_dim, pv_mask=True, num_cams=num_cams, num_coords=3, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask', 'pv_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), 
dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
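# Note on this warmup stage: the BEV module is frozen (freeze_bev=True above)
# and min_lr_ratio=0.95, so the cosine schedule barely decays the learning
# rate from the base 5e-4 over the short 3-epoch run; the intent appears to be
# warming up the newly attached vector/tracking heads on top of the stage-1
# BEV features before the joint fine-tuning stage.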
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_oldsplit_5frame_span10_stage1_bev_pretrain/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_oldsplit/maptracker_av2_oldsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 2 num_iters_per_epoch = 27243 // (num_gpus * batch_size) num_epochs = 20 num_epochs_interval = num_epochs // 5 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, 
num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 
'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
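# Rough effective learning rates implied by paramwise_cfg above (custom_keys
# are matched against parameter names by mmcv's default optimizer constructor):
#   backbone.img_backbone                      -> 5e-4 * 0.1 = 5e-5
#   backbone.img_neck / backbone.transformer /
#   backbone.positional_encoding / seg_decoder -> 5e-4 * 0.5 = 2.5e-4
#   all remaining parameters                   -> base lr 5e-4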
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_oldsplit_5frame_span10_stage2_warmup/latest.pth' ================================================ FILE: plugin/configs/maptracker/nuscenes_newsplit/maptracker_nusc_newsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_cams = 6 num_gpus = 8 batch_size = 3 num_iters_per_epoch = 27846 // (num_gpus * batch_size) num_epochs = 18 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims), ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for 
evaluation code # DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'ego2cam', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name', 'img_filenames', 'cam_intrinsics', 'cam_extrinsics', 'lidar2ego_translation', 'lidar2ego_rotation']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, eval_semantic=True, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
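# Schedule sketch for this stage: the lr warms up linearly from roughly
# warmup_ratio * base_lr = (1/3) * 5e-4 over the first 500 iterations, then
# follows cosine annealing down to about
# min_lr = min_lr_ratio * base_lr = 5e-2 * 5e-4 = 2.5e-5 by `max_iters`.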
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True ================================================ FILE: plugin/configs/maptracker/nuscenes_newsplit/maptracker_nusc_newsplit_5frame_span10_stage2_warmup.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_gpus = 8 batch_size = 8 num_iters_per_epoch = 27846 // (num_gpus * batch_size) num_epochs = 4 num_epochs_interval = num_epochs total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=True, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, use_grid_mask=True, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], #in_channels=[128, 256, 512], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, ) ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 
'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # 
DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=10, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
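# As in the corresponding AV2 warmup config, this stage keeps the learning
# rate nearly flat (min_lr_ratio=0.95) and starts from the stage-1 BEV
# pretraining checkpoint via `load_from` below; with freeze_bev=True, only the
# vector/tracking components are effectively being trained here.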
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = "work_dirs/maptracker_nusc_newsplit_5frame_span10_stage1_bev_pretrain/latest.pth" ================================================ FILE: plugin/configs/maptracker/nuscenes_newsplit/maptracker_nusc_newsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_gpus = 8 batch_size = 4 num_iters_per_epoch = 27846 // (num_gpus * batch_size) num_epochs = 36 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, use_grid_mask=True, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, ) ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 
'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
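# Stage 3 unfreezes the BEV backbone (freeze_bev=False) and fine-tunes the
# whole model for 36 epochs; the pretrained parts (image backbone, FPN neck,
# BEV transformer, positional encoding, segmentation decoder) use reduced lr
# multipliers, and mem_warmup_iters=-1 presumably means the tracking memory is
# active from the first iteration instead of after a warmup period.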
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = "work_dirs/maptracker_nusc_newsplit_5frame_span10_stage2_warmup/latest.pth" ================================================ FILE: plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_cams = 6 num_gpus = 8 batch_size = 1 num_iters_per_epoch = 27968 // (num_gpus * batch_size) num_epochs = 18 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims), ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 
'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'ego2cam', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name', 'img_filenames', 'cam_intrinsics', 'cam_extrinsics', 'lidar2ego_translation', 'lidar2ego_rotation']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, eval_semantic=True, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
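# For this BEV-pretraining stage the val/test splits set eval_semantic=True,
# and the evaluation pipeline rasterizes the ground-truth map (RasterizeMap
# with semantic_mask=True), so both the vectorized elements and the rasterized
# semantic masks are collected for evaluation.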
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True ================================================ FILE: plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage2_warmup.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_gpus = 8 batch_size = 6 num_iters_per_epoch = 27968 // (num_gpus * batch_size) num_epochs = 4 num_epochs_interval = num_epochs total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=True, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, use_grid_mask=True, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, ) ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), 
), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( 
type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) # only slightly decay evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
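# NOTE: with the settings above (num_gpus=8, batch_size=6, num_epochs=4) this
# gives num_iters_per_epoch = 27968 // 48 = 582, so the eval/checkpoint interval
# is 4 * 582 = 2328 iterations, equal to max_iters; evaluation and checkpointing
# therefore effectively run once, at the end of this short warmup stage.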
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = "work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage1_bev_pretrain/latest.pth" ================================================ FILE: plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_gpus = 8 batch_size = 2 num_iters_per_epoch = 27968 // (num_gpus * batch_size) num_epochs = 48 num_epochs_interval = num_epochs // 8 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, use_grid_mask=True, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, ) ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, ## an addtional cross attention for vector memory fusion operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 
'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
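# NOTE: here num_iters_per_epoch = 27968 // (8 * 2) = 1748, so max_iters is
# 48 * 1748 = 83904 and eval/checkpoints fire every 6 * 1748 = 10488 iterations.
# The paramwise_cfg above lowers the step size of the pretrained BEV modules
# during joint fine-tuning, e.g. 'backbone.img_backbone' trains at
# 0.1 * 5e-4 = 5e-5, while modules not listed keep the full base lr of 5e-4.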
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = "work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage2_warmup/latest.pth" ================================================ FILE: plugin/core/apis/__init__.py ================================================ from .train import custom_train_model from .mmdet_train import custom_train_detector # from .test import custom_multi_gpu_test ================================================ FILE: plugin/core/apis/mmdet_train.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- import random import warnings import numpy as np import torch import torch.distributed as dist from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, IterBasedRunner, RUNNERS, Fp16OptimizerHook, OptimizerHook, build_optimizer, build_runner, get_dist_info) from mmcv.utils import build_from_cfg from mmdet.core import EvalHook from mmdet.datasets import (build_dataset, replace_ImageToTensor) from mmdet.utils import get_root_logger import time import os.path as osp from ...datasets.builder import build_dataloader from ..evaluation.eval_hooks import CustomDistEvalHook @RUNNERS.register_module() class MyRunnerWrapper(IterBasedRunner): def train(self, data_loader, **kwargs): self.model.module.num_iter = self._iter self.model.train() self.mode = 'train' self.data_loader = data_loader self._epoch = data_loader.epoch self.model.module.num_epoch = self._epoch data_batch = next(data_loader) self.call_hook('before_train_iter') outputs = self.model.train_step(data_batch, self.optimizer, **kwargs) if not isinstance(outputs, dict): raise TypeError('model.train_step() must return a dict') if 'log_vars' in outputs: self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) self.outputs = outputs self.call_hook('after_train_iter') self._inner_iter += 1 self._iter += 1 def custom_train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, eval_model=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] #assert len(dataset)==1s if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. 
' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), runner_type=cfg.runner, ) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) if eval_model is not None: eval_model = MMDistributedDataParallel( eval_model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) if eval_model is not None: eval_model = MMDataParallel( eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) if 'runner' not in cfg: cfg.runner = { 'type': 'EpochBasedRunner', 'max_epochs': cfg.total_epochs } warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) else: if 'total_epochs' in cfg: assert cfg.total_epochs == cfg.runner.max_epochs if eval_model is not None: runner = build_runner( cfg.runner, default_args=dict( model=model, eval_model=eval_model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) else: runner = build_runner( cfg.runner, default_args=dict( model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) # register profiler hook #trace_config = dict(type='tb_trace', dir_name='work_dir') #profiler_config = dict(on_trace_ready=trace_config) #runner.register_profiler_hook(profiler_config) if distributed: if isinstance(runner, EpochBasedRunner): runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: assert False # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( 
val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), ) eval_cfg = cfg.get('evaluation', {}) #eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_cfg['by_epoch'] = not isinstance(runner, IterBasedRunner) eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) eval_hook = CustomDistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority='LOW') # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) ================================================ FILE: plugin/core/apis/test.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import os.path as osp import pickle import shutil import tempfile import time import mmcv import torch import torch.distributed as dist from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info from mmdet.core import encode_mask_results import mmcv import numpy as np import pycocotools.mask as mask_util def custom_encode_mask_results(mask_results): """Encode bitmap mask to RLE code. Semantic Masks only Args: mask_results (list | tuple[list]): bitmap mask results. In mask scoring rcnn, mask_results is a tuple of (segm_results, segm_cls_score). Returns: list | tuple: RLE encoded mask. """ cls_segms = mask_results num_classes = len(cls_segms) encoded_mask_results = [] for i in range(len(cls_segms)): encoded_mask_results.append( mask_util.encode( np.array( cls_segms[i][:, :, np.newaxis], order='F', dtype='uint8'))[0]) # encoded with RLE return [encoded_mask_results] def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to 'tmpdir' and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. Returns: list: The prediction results. 
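    Example (sketch; assumes ``model`` is already wrapped in
    ``MMDistributedDataParallel`` and ``val_loader`` is a built dataloader, as
    done by ``CustomDistEvalHook._do_evaluate``):
        results = custom_multi_gpu_test(model, val_loader,
                                        tmpdir='./work_dir/.eval_hook',
                                        gpu_collect=False)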
""" model.eval() bbox_results = [] mask_results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) time.sleep(2) # This line can prevent deadlock problem in some cases. have_mask = False for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) # encode mask results if isinstance(result, dict): if 'bbox_results' in result.keys(): bbox_result = result['bbox_results'] batch_size = len(result['bbox_results']) bbox_results.extend(bbox_result) if 'mask_results' in result.keys() and result['mask_results'] is not None: mask_result = custom_encode_mask_results(result['mask_results']) mask_results.extend(mask_result) have_mask = True else: batch_size = len(result) bbox_results.extend(result) if rank == 0: for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks if gpu_collect: bbox_results = collect_results_gpu(bbox_results, len(dataset)) if have_mask: mask_results = collect_results_gpu(mask_results, len(dataset)) else: mask_results = None else: bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) tmpdir = tmpdir+'_mask' if tmpdir is not None else None if have_mask: mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) else: mask_results = None if mask_results is None: return bbox_results return {'bbox_results': bbox_results, 'mask_results': mask_results} def collect_results_cpu(result_part, size, tmpdir=None): rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: mmcv.mkdir_or_exist('.dist_test') tmpdir = tempfile.mkdtemp(dir='.dist_test') tmpdir = torch.tensor( bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, f'part_{i}.pkl') part_list.append(mmcv.load(part_file)) # sort the results ordered_results = [] ''' bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, ''' #for res in zip(*part_list): for res in part_list: ordered_results.extend(list(res)) # the dataloader may pad some samples print(f'\ntruncate {size} samples from {len(ordered_results)}') ordered_results = ordered_results[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results def collect_results_gpu(result_part, size): collect_results_cpu(result_part, size) ================================================ FILE: plugin/core/apis/train.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- from .mmdet_train import custom_train_detector from mmseg.apis import train_segmentor from mmdet.apis import train_detector def custom_train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, eval_model=None, meta=None): """A function wrapper for launching model training according to cfg. Because we need different eval_hook in runner. Should be deprecated in the future. """ if cfg.model.type in ['EncoderDecoder3D']: assert False else: custom_train_detector( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, eval_model=eval_model, meta=meta) def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """A function wrapper for launching model training according to cfg. Because we need different eval_hook in runner. Should be deprecated in the future. """ if cfg.model.type in ['EncoderDecoder3D']: train_segmentor( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) else: train_detector( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) ================================================ FILE: plugin/core/evaluation/__init__.py ================================================ from .eval_hooks import CustomDistEvalHook ================================================ FILE: plugin/core/evaluation/eval_hooks.py ================================================ # Note: Considering that MMCV's EvalHook updated its interface in V1.3.16, # in order to avoid strong version dependency, we did not directly # inherit EvalHook but BaseDistEvalHook. 
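# The hook below keeps BaseDistEvalHook's behaviour but additionally accepts a
# `dynamic_intervals` list of (milestone, interval) tuples, so the evaluation
# frequency can change as training progresses (see _calc_dynamic_intervals and
# _decide_interval below).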
import bisect import os.path as osp import mmcv import torch.distributed as dist from mmcv.runner import DistEvalHook as BaseDistEvalHook from mmcv.runner import EvalHook as BaseEvalHook from torch.nn.modules.batchnorm import _BatchNorm from mmdet.core.evaluation.eval_hooks import DistEvalHook def _calc_dynamic_intervals(start_interval, dynamic_interval_list): assert mmcv.is_list_of(dynamic_interval_list, tuple) dynamic_milestones = [0] dynamic_milestones.extend( [dynamic_interval[0] for dynamic_interval in dynamic_interval_list]) dynamic_intervals = [start_interval] dynamic_intervals.extend( [dynamic_interval[1] for dynamic_interval in dynamic_interval_list]) return dynamic_milestones, dynamic_intervals class CustomDistEvalHook(BaseDistEvalHook): def __init__(self, *args, dynamic_intervals=None, **kwargs): super(CustomDistEvalHook, self).__init__(*args, **kwargs) self.use_dynamic_intervals = dynamic_intervals is not None if self.use_dynamic_intervals: self.dynamic_milestones, self.dynamic_intervals = \ _calc_dynamic_intervals(self.interval, dynamic_intervals) def _decide_interval(self, runner): if self.use_dynamic_intervals: progress = runner.epoch if self.by_epoch else runner.iter step = bisect.bisect(self.dynamic_milestones, (progress + 1)) # Dynamically modify the evaluation interval self.interval = self.dynamic_intervals[step - 1] def before_train_epoch(self, runner): """Evaluate the model only at the start of training by epoch.""" self._decide_interval(runner) super().before_train_epoch(runner) def before_train_iter(self, runner): self._decide_interval(runner) super().before_train_iter(runner) def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" # Synchronization of BatchNorm's buffer (running_mean # and running_var) is not supported in the DDP of pytorch, # which may cause the inconsistent performance of models in # different ranks, so we broadcast BatchNorm's buffers # of rank 0 to other ranks to avoid this. if self.broadcast_bn_buffer: model = runner.model for name, module in model.named_modules(): if isinstance(module, _BatchNorm) and module.track_running_stats: dist.broadcast(module.running_var, 0) dist.broadcast(module.running_mean, 0) if not self._should_evaluate(runner): return tmpdir = self.tmpdir if tmpdir is None: tmpdir = osp.join(runner.work_dir, '.eval_hook') from ..apis.test import custom_multi_gpu_test # to solve circlur import results = custom_multi_gpu_test( runner.model, self.dataloader, tmpdir=tmpdir, gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) ================================================ FILE: plugin/datasets/__init__.py ================================================ from .pipelines import * from .argo_dataset import AV2Dataset from .nusc_dataset import NuscDataset ================================================ FILE: plugin/datasets/argo_dataset.py ================================================ from .base_dataset import BaseMapDataset from .map_utils.av2map_extractor import AV2MapExtractor from mmdet.datasets import DATASETS import numpy as np from .visualize.renderer import Renderer from time import time import mmcv from pyquaternion import Quaternion import pickle import os @DATASETS.register_module() class AV2Dataset(BaseMapDataset): """Argoverse2 map dataset class. 
Args: ann_file (str): annotation file path cat2id (dict): category to class id roi_size (tuple): bev range eval_config (Config): evaluation config meta (dict): meta information pipeline (Config): data processing pipeline config, interval (int): annotation load interval work_dir (str): path to work dir test_mode (bool): whether in test mode """ def __init__(self, **kwargs,): super().__init__(**kwargs) self.map_extractor = AV2MapExtractor(self.roi_size, self.id2map) self.renderer = Renderer(self.cat2id, self.roi_size, 'av2') def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations. """ start_time = time() ann = mmcv.load(ann_file) self.id2map = ann['id2map'] samples = ann['samples'] if 'newsplit' not in ann_file: if 'val' in ann_file: # For the old split testing, we make sure that the test set matches exactly with the MapTR codebase # NOTE: simply sort&sampling will produce slightly different results compared to MapTR's samples # so we have to directly use the saved meta information from MapTR codebase to get the samples maptr_meta_path = os.path.join(os.path.dirname(ann_file), 'maptrv2_val_samples_info.pkl') with open(maptr_meta_path, 'rb') as f: maptr_meta = pickle.load(f) maptr_unique_tokens = [x['token'] for x in maptr_meta['samples_meta']] unique_token2samples = {} for sample in samples: unique_token2samples[f'{sample["log_id"]}_{sample["token"]}'] = sample samples = [unique_token2samples[x] for x in maptr_unique_tokens] else: # For the old split training, we follow MapTR's data loading, which # sorts the samples based on the token, then do sub-sampling samples = list(sorted(samples, key=lambda e: e['token'])) samples = samples[::self.interval] else: # For the new split, we simply follow StreamMapNet, do not sort based on the token # In this way, the intervals between consecutive frames are uniform... samples = samples[::self.interval] # Since the sorted order copied from MapTR does not strictly enforce that # samples of the same scene are consecutive, need to re-arrange scene_name2idx = {} for idx, sample in enumerate(samples): scene = sample['log_id'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2idx[scene].append(idx) samples_rearrange = [] for scene_name in scene_name2idx: scene_sample_ids = scene_name2idx[scene_name] for sample_id in scene_sample_ids: samples_rearrange.append(samples[sample_id]) samples = samples_rearrange print(f'collected {len(samples)} samples in {(time() - start_time):.2f}s') self.samples = samples def load_matching(self, matching_file): with open(matching_file, 'rb') as pf: data = pickle.load(pf) total_samples = 0 for scene_name, info in data.items(): total_samples += len(info['sample_ids']) assert total_samples == len(self.samples), 'Matching info not matched with data samples' self.matching_meta = data print(f'loaded matching meta for {len(data)} scenes') def get_sample(self, idx): """Get data sample. For each sample, map extractor will be applied to extract map elements. 
Args: idx (int): data index Returns: result (dict): dict of input """ sample = self.samples[idx] log_id = sample['log_id'] map_geoms = self.map_extractor.get_map_geom(log_id, sample['e2g_translation'], sample['e2g_rotation']) map_label2geom = {} for k, v in map_geoms.items(): if k in self.cat2id.keys(): map_label2geom[self.cat2id[k]] = v ego2img_rts = [] for c in sample['cams'].values(): extrinsic, intrinsic = np.array( c['extrinsics']), np.array(c['intrinsics']) ego2cam_rt = extrinsic viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic ego2cam_rt = (viewpad @ ego2cam_rt) ego2img_rts.append(ego2cam_rt) # pdb.set_trace() input_dict = { 'token': sample['token'], 'img_filenames': [c['img_fpath'] for c in sample['cams'].values()], # intrinsics are 3x3 Ks 'cam_intrinsics': [c['intrinsics'] for c in sample['cams'].values()], # extrinsics are 4x4 tranform matrix, NOTE: **ego2cam** 'cam_extrinsics': [c['extrinsics'] for c in sample['cams'].values()], 'ego2img': ego2img_rts, 'map_geoms': map_label2geom, # {0: List[ped_crossing(LineString)], 1: ...} 'ego2global_translation': sample['e2g_translation'], 'ego2global_rotation': sample['e2g_rotation'].tolist(), 'sample_idx': sample['modified_sample_idx'], 'scene_name': sample['scene_name'], 'lidar_path': sample['lidar_fpath'] } return input_dict ================================================ FILE: plugin/datasets/base_dataset.py ================================================ import numpy as np import os import os.path as osp import mmcv from .evaluation.raster_eval import RasterEvaluate from .evaluation.vector_eval import VectorEvaluate from mmdet3d.datasets.pipelines import Compose from mmdet.datasets import DATASETS from torch.utils.data import Dataset from mmcv.parallel import DataContainer as DC import warnings import pickle warnings.filterwarnings("ignore") @DATASETS.register_module() class BaseMapDataset(Dataset): """Map dataset base class. 
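    Besides the arguments listed below, the constructor also accepts
    seq_split_num (int), multi_frame (int | bool), sampling_span (int),
    matching (bool) and eval_semantic (bool), which control sequence grouping,
    multi-frame sampling and ground-truth track matching (see __init__).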
Args: ann_file (str): annotation file path cat2id (dict): category to class id roi_size (tuple): bev range eval_config (Config): evaluation config meta (dict): meta information pipeline (Config): data processing pipeline config, interval (int): annotation load interval work_dir (str): path to work dir test_mode (bool): whether in test mode """ def __init__(self, ann_file, cat2id, roi_size, meta, pipeline, interval=1, seq_split_num=1, work_dir=None, eval_config=None, test_mode=False, multi_frame=False, sampling_span=10, matching=False, eval_semantic=False, ): super().__init__() self.ann_file = ann_file self.multi_frame = multi_frame self.sampling_span = sampling_span self.matching = matching self.meta = meta self.classes = list(cat2id.keys()) self.num_classes = len(self.classes) self.cat2id = cat2id self.interval = interval self.seq_split_num = seq_split_num self.eval_semantic = eval_semantic self.load_annotations(self.ann_file) if matching: assert self.multi_frame, 'The matching info has to loaded under the multi-frame setting' self.matching_file = ann_file[:-4] + '_gt_tracks.pkl' assert os.path.isfile(self.matching_file) self.load_matching(self.matching_file) self.idx2token = {} for i, s in enumerate(self.samples): self.idx2token[i] = s['token'] self.token2idx = {v: k for k, v in self.idx2token.items()} if pipeline is not None: self.pipeline = Compose(pipeline) else: self.pipeline = None # dummy flags to fit with mmdet dataset self.flag = np.zeros(len(self), dtype=np.uint8) self.roi_size = roi_size self.work_dir = work_dir self.eval_config = eval_config if self.eval_config is not None: assert test_mode, "eval_config is valid only in test_mode" # record the sequence information, prepare for two-frame data loading self._set_sequence_info() self._set_sequence_group_flag() def _set_sequence_info(self): """Compute and record the sequence id and local index of each sample """ scene_name2idx = {} for idx, sample in enumerate(self.samples): self.samples[idx]['modified_sample_idx'] = idx scene = sample['scene_name'] if scene not in scene_name2idx: scene_name2idx[scene] = [] self.samples[idx]['prev'] = -1 scene_name2idx[scene].append(idx) self.scene_name2idx = scene_name2idx print('Prepare sequence information for {}'.format(self.ann_file)) idx2scene = {} for scene_name, scene_info in scene_name2idx.items(): for local_idx, global_idx in enumerate(scene_info): idx2scene[global_idx] = (scene_name, local_idx, len(scene_info)) self.idx2scene = idx2scene def _set_sequence_group_flag(self): """ Set each sequence to be a different group """ if self.seq_split_num == -1: self.flag = np.arange(len(self.samples)) return elif self.seq_split_num == -2: return res = [] curr_sequence = -1 for idx in range(len(self.samples)): if self.samples[idx]['prev'] == -1: # new sequence curr_sequence += 1 res.append(curr_sequence) self.flag = np.array(res, dtype=np.int64) if self.seq_split_num != 1: bin_counts = np.bincount(self.flag) new_flags = [] curr_new_flag = 0 for curr_flag in range(len(bin_counts)): seq_length = int(round(bin_counts[curr_flag] / self.seq_split_num)) curr_sequence_length = list(range(0, bin_counts[curr_flag], seq_length)) + [bin_counts[curr_flag]] # if left one sample, put it into the last sequence if curr_sequence_length[-1] - curr_sequence_length[-2] <= 1: curr_sequence_length = curr_sequence_length[:-2] + [curr_sequence_length[-1]] curr_sequence_length = np.array(curr_sequence_length) for sub_seq_idx in (curr_sequence_length[1:] - curr_sequence_length[:-1]): for _ in range(sub_seq_idx): 
new_flags.append(curr_new_flag) curr_new_flag += 1 assert len(new_flags) == len(self.flag) # assert len(np.bincount(new_flags)) == len(np.bincount(self.flag)) * self.seq_split_num self.flag = np.array(new_flags, dtype=np.int64) def load_annotations(self, ann_file): raise NotImplementedError def load_matching(self, matching_file): raise NotImplementedError def get_sample(self, idx): raise NotImplementedError def format_results(self, results, denormalize=True, prefix=None, save_semantic=False): '''Format prediction result to submission format. Args: results (list[Tensor]): List of prediction results. denormalize (bool): whether to denormalize prediction from (0, 1) \ to bev range. Default: True prefix (str): work dir prefix to save submission file. Returns: dict: Evaluation results ''' meta = self.meta output_format = meta['output_format'] submissions = { 'meta': meta, 'results': {}, } if output_format == 'raster': for pred in results: single_case = {} token = pred['token'] pred_map = pred['semantic_mask'] pred_bool = pred_map > 0 single_case['semantic_mask'] = pred_bool.bool() submissions['results'][token] = single_case # Use pickle format to minimize submission file size. out_path = osp.join(prefix, 'submission_raster.pkl') print(f'saving submissions results to {out_path}') os.makedirs(os.path.dirname(out_path), exist_ok=True) mmcv.dump(submissions, out_path) return out_path elif output_format == 'vector': all_pos_results = [] for pred in results: ''' For each case, the result should be formatted as Dict{'vectors': [], 'scores': [], 'labels': []} 'vectors': List of vector, each vector is a array([[x1, y1], [x2, y2] ...]), contain all vectors predicted in this sample. 'scores: List of score(float), contain scores of all instances in this sample. 'labels': List of label(int), contain labels of all instances in this sample. 
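            For instance, a single entry could look like (illustrative values):
                {'vectors': [array([[x1, y1], [x2, y2], ...]), ...],
                 'scores': [0.87, ...], 'labels': [2, ...]}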
''' if pred is None: # empty prediction continue single_case = {'vectors': [], 'scores': [], 'labels': [], 'props': [], 'track_vectors': [], 'track_scores': [], 'track_labels': []} token = pred['token'] roi_size = np.array(self.roi_size) origin = -np.array([self.roi_size[0]/2, self.roi_size[1]/2]) # save the extra semantic info if save_semantic: single_case['semantic_mask'] = pred['semantic_mask'].tolist() if 'scores' in pred: for i in range(len(pred['scores'])): score = pred['scores'][i] label = pred['labels'][i] vector = pred['vectors'][i] prop = pred['props'][i] # A line should have >=2 points if len(vector) < 2: continue if denormalize: eps = 1e-5 vector = vector * (roi_size + eps) + origin single_case['vectors'].append(vector) single_case['scores'].append(score) single_case['labels'].append(label) single_case['props'].append(prop) if 'track_scores' in pred: # also save the tracking information for analyzing for i in range(len(pred['track_scores'])): score = pred['track_scores'][i] label = pred['track_labels'][i] vector = pred['track_vectors'][i] if denormalize: eps = 1e-5 vector = vector * (roi_size + eps) + origin single_case['track_vectors'].append(vector) single_case['track_scores'].append(score) single_case['track_labels'].append(label) submissions['results'][token] = single_case if not self.eval_semantic: pos_results = pred['pos_results'] pos_vectors = pos_results['vectors'] if denormalize and len(pos_vectors) > 0: pos_vectors = pos_vectors.reshape(pos_vectors.shape[0], -1, 2) pos_vectors = (pos_vectors * roi_size + origin).reshape(pos_vectors.shape[0], -1) save_pos_results = { 'vectors': pos_vectors, 'labels': pos_results['labels'], 'scores': pos_results['scores'], 'scene_name': pos_results['scene_name'], 'local_idx': pos_results['local_idx'], 'global_ids': pos_results['global_ids'], 'meta': pred['meta'] } all_pos_results.append(save_pos_results) out_path = osp.join(prefix, 'submission_vector.json') print(f'saving submissions results to {out_path}') os.makedirs(os.path.dirname(out_path), exist_ok=True) mmcv.dump(submissions, out_path) if not self.eval_semantic: out_path_pos = osp.join(prefix, 'pos_predictions.pkl') with open(out_path_pos, 'wb') as f: pickle.dump(all_pos_results, f, protocol=pickle.HIGHEST_PROTOCOL) return out_path else: raise ValueError("output format must be either \'raster\' or \'vector\'") def evaluate(self, results, logger=None, **kwargs): '''Evaluate prediction result based on `output_format` specified by dataset. Args: results (list[Tensor]): List of prediction results. logger (logger): logger to print evaluation results. Returns: dict: Evaluation results. ''' print('len of the results', len(results)) eval_semantic = True if (hasattr(self, 'eval_semantic') and self.eval_semantic) else False save_semantic = True if 'save_semantic' in kwargs and kwargs['save_semantic'] or eval_semantic \ else False result_path = self.format_results(results, denormalize=True, prefix=self.work_dir, save_semantic=save_semantic) return self._evaluate(result_path, logger=logger, eval_semantic=eval_semantic) def _evaluate(self, result_path, logger=None, eval_semantic=False): if not eval_semantic: self.evaluator = VectorEvaluate(self.eval_config) else: self.evaluator = RasterEvaluate(self.eval_config) result_dict = self.evaluator.evaluate(result_path, logger=logger) return result_dict def show_gt(self, idx, out_dir='demo/'): '''Visualize ground-truth. Args: idx (int): index of sample. out_dir (str): output directory. 
''' from mmcv.parallel import DataContainer from copy import deepcopy sample = self.get_sample(idx) sample = deepcopy(sample) data = self.pipeline(sample) #imgs = [mmcv.imread(i) for i in sample['img_filenames']] #cam_extrinsics = sample['cam_extrinsics'] #cam_intrinsics = sample['cam_intrinsics'] if 'vectors' in data: vectors = data['vectors'] if isinstance(vectors, DataContainer): vectors = vectors.data self.renderer.render_bev_from_vectors(vectors, out_dir) #self.renderer.render_camera_views_from_vectors(vectors, imgs, # cam_extrinsics, cam_intrinsics, 2, out_dir) if 'semantic_mask' in data: semantic_mask = data['semantic_mask'] if isinstance(semantic_mask, DataContainer): semantic_mask = semantic_mask.data self.renderer.render_bev_from_mask(semantic_mask, out_dir, flip=True) def show_result(self, submission, idx, score_thr=0, draw_score=False, show_semantic=False, out_dir='demo/'): '''Visualize prediction result. Args: idx (int): index of sample. submission (dict): prediction results. score_thr (float): threshold to filter prediction results. out_dir (str): output directory. ''' meta = submission['meta'] output_format = meta['output_format'] token = self.idx2token[idx] results = submission['results'][token] sample = self.get_sample(idx) if 'semantic_mask' in results and show_semantic: semantic_mask = np.array(results['semantic_mask']) self.renderer.render_bev_from_mask(semantic_mask, out_dir, flip=False) if output_format == 'vector' and 'scores' in results: vectors = {label: [] for label in self.cat2id.values()} for i in range(len(results['labels'])): score = results['scores'][i] label = results['labels'][i] prop = results['props'][i] v = results['vectors'][i] if score > score_thr: if draw_score: vectors[label].append((v, score, prop)) else: vectors[label].append(v) self.renderer.render_bev_from_vectors(vectors, out_dir, draw_scores=draw_score) # For projecting and visualizing results on perspective images #imgs = [mmcv.imread(i) for i in sample['img_filenames']] #cam_extrinsics = sample['cam_extrinsics'] #cam_intrinsics = sample['cam_intrinsics'] # self.renderer.render_camera_views_from_vectors(vectors, imgs, # cam_extrinsics, cam_intrinsics, 2, out_dir) def show_track(self, submission, idx, out_dir='demo/'): '''Visualize prediction result. Args: idx (int): index of sample. submission (dict): prediction results. score_thr (float): threshold to filter prediction results. out_dir (str): output directory. ''' meta = submission['meta'] token = self.idx2token[idx] results = submission['results'][token] vectors = {label: [] for label in self.cat2id.values()} for i in range(len(results['track_labels'])): score = results['track_scores'][i] label = results['track_labels'][i] v = results['track_vectors'][i] vectors[label].append((v, score, 1)) self.renderer.render_bev_from_vectors(vectors, out_dir, draw_scores=True) def __len__(self): """Return the length of data infos. Returns: int: Length of data infos. """ return len(self.samples) def _rand_another(self, idx): """Randomly get another item. Returns: int: Another index of item. """ return np.random.choice(self.__len__) def __getitem__(self, idx): """Get item from infos according to the given index. Returns: dict: Data dictionary of the corresponding index. 
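        The dict always includes a 'seq_info' DataContainer holding
        (scene_name, local_idx, scene_length). When multi_frame is enabled it
        additionally contains 'all_prev_data' (pipeline outputs of the sampled
        previous frames) and 'all_local2global_info' (per-frame instance-id
        mappings, populated only when matching is on).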
""" input_dict = self.get_sample(idx) data = self.pipeline(input_dict) # prepare the local sequence index info seq_info = self.idx2scene[idx] data['seq_info'] = DC(seq_info, cpu_only=True) if self.multi_frame: # used when sampling multi-frame training data scene_name = input_dict['scene_name'] scene_seq_info = self.scene_name2idx[scene_name] local_idx_curr = input_dict['sample_idx'] - scene_seq_info[0] span = max(self.sampling_span, self.multi_frame) min_idx = local_idx_curr - span sampled_indices = np.random.choice(span, self.multi_frame-1, replace=False).tolist() sampled_indices = sorted(sampled_indices) local_indices_prev = [min_idx + x for x in sampled_indices] local_indices_prev = [x if x>=0 else 0 for x in local_indices_prev] data['img_metas'].data['local_idx'] = local_idx_curr global_indices_prev = [local_idx + scene_seq_info[0] for local_idx in local_indices_prev] all_prev_data = [] for idx, global_idx_prev in enumerate(global_indices_prev): input_dict_prev = self.get_sample(global_idx_prev) data_prev = self.pipeline(input_dict_prev) local_idx_prev = local_indices_prev[idx] data_prev['img_metas'].data['local_idx'] = local_idx_prev all_prev_data.append(data_prev) all_local2global_info = [] if self.matching: scene_matching_info = self.matching_meta[scene_name] for local_idx_prev in local_indices_prev: prev_local2global = DC(scene_matching_info['instance_ids'][local_idx_prev], cpu_only=True) all_local2global_info.append(prev_local2global) curr_local2global = DC(scene_matching_info['instance_ids'][local_idx_curr], cpu_only=True) all_local2global_info.append(curr_local2global) data['all_prev_data'] = all_prev_data data['all_local2global_info'] = all_local2global_info return data ================================================ FILE: plugin/datasets/builder.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- import copy import platform import random from functools import partial import numpy as np from mmcv.parallel import collate from mmcv.runner import get_dist_info from mmcv.utils import Registry, build_from_cfg from torch.utils.data import DataLoader from mmdet.datasets.samplers import GroupSampler from .samplers.group_sampler import DistributedGroupSampler from .samplers.distributed_sampler import DistributedSampler from .samplers.group_sampler import InfiniteGroupEachSampleInBatchSampler from .samplers.sampler import build_sampler def build_dataloader(dataset, samples_per_gpu, workers_per_gpu, num_gpus=1, dist=True, shuffle=True, seed=None, shuffler_sampler=None, nonshuffler_sampler=None, runner_type=dict(type='EpochBasedRunner'), **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. In non-distributed training, there is only one dataloader for all GPUs. Args: dataset (Dataset): A PyTorch dataset. samples_per_gpu (int): Number of training samples on each GPU, i.e., batch size of each GPU. workers_per_gpu (int): How many subprocesses to use for data loading for each GPU. num_gpus (int): Number of GPUs. Only used in non-distributed training. dist (bool): Distributed training/test or not. Default: True. shuffle (bool): Whether to shuffle the data at every epoch. Default: True. kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. 
""" rank, world_size = get_dist_info() if dist: # DistributedGroupSampler will definitely shuffle the data to satisfy # that images on each GPU are in the same group if shuffle: sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'), dict( dataset=dataset, samples_per_gpu=samples_per_gpu, num_replicas=world_size, rank=rank, seed=seed) ) else: sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'), dict( dataset=dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed) ) batch_size = samples_per_gpu num_workers = workers_per_gpu batch_sampler = None else: # assert False, 'not support in bevformer' # print('WARNING!!!!, Only can be used for obtain inference speed!!!!') sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu batch_sampler = None # True entry here!!! if runner_type['type'] == 'IterBasedRunner' and shuffler_sampler['type'] =='InfiniteGroupEachSampleInBatchSampler': # TODO: original has more options, but I'm not using them # https://github.com/open-mmlab/mmdetection/blob/3b72b12fe9b14de906d1363982b9fba05e7d47c1/mmdet/datasets/builder.py#L145-L157 batch_sampler = build_sampler(shuffler_sampler, dict( dataset=dataset, samples_per_gpu=samples_per_gpu, num_replicas=world_size, rank=rank, seed=seed) ) batch_size = 1 # Since we have batch sampler, the batch_size must = 1 sampler = None init_fn = partial( worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None data_loader = DataLoader( dataset, batch_size=batch_size, sampler=sampler, batch_sampler=batch_sampler, num_workers=num_workers, collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), pin_memory=False, worker_init_fn=init_fn, **kwargs) return data_loader def worker_init_fn(worker_id, num_workers, rank, seed): # The seed of each worker equals to # num_worker * rank + worker_id + user_seed worker_seed = num_workers * rank + worker_id + seed np.random.seed(worker_seed) random.seed(worker_seed) # Copyright (c) OpenMMLab. All rights reserved. 
# import platform
# from mmcv.utils import Registry, build_from_cfg
# from mmdet.datasets import DATASETS
# from mmdet.datasets.builder import _concat_dataset

# if platform.system() != 'Windows':
#     # https://github.com/pytorch/pytorch/issues/973
#     import resource
#     rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
#     base_soft_limit = rlimit[0]
#     hard_limit = rlimit[1]
#     soft_limit = min(max(4096, base_soft_limit), hard_limit)
#     resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))

# OBJECTSAMPLERS = Registry('Object sampler')

# def custom_build_dataset(cfg, default_args=None):
#     from mmdet3d.datasets.dataset_wrappers import CBGSDataset
#     from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset,
#                                                  ConcatDataset, RepeatDataset)

#     if isinstance(cfg, (list, tuple)):
#         dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg])
#     elif cfg['type'] == 'ConcatDataset':
#         dataset = ConcatDataset(
#             [custom_build_dataset(c, default_args) for c in cfg['datasets']],
#             cfg.get('separate_eval', True))
#     elif cfg['type'] == 'RepeatDataset':
#         dataset = RepeatDataset(
#             custom_build_dataset(cfg['dataset'], default_args), cfg['times'])
#     elif cfg['type'] == 'ClassBalancedDataset':
#         dataset = ClassBalancedDataset(
#             custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
#     elif cfg['type'] == 'CBGSDataset':
#         dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args))
#     elif isinstance(cfg.get('ann_file'), (list, tuple)):
#         dataset = _concat_dataset(cfg, default_args)
#     else:
#         dataset = build_from_cfg(cfg, DATASETS, default_args)

#     return dataset



================================================
FILE: plugin/datasets/evaluation/AP.py
================================================
import numpy as np
from .distance import chamfer_distance, frechet_distance, chamfer_distance_batch
from typing import List, Tuple, Union
from numpy.typing import NDArray
import torch


def average_precision(recalls, precisions, mode='area'):
    """Calculate average precision.

    Args:
        recalls (ndarray): shape (num_dets, )
        precisions (ndarray): shape (num_dets, )
        mode (str): 'area' or '11points', 'area' means calculating the area
            under precision-recall curve, '11points' means calculating the
            average precision of recalls at [0, 0.1, ..., 1]

    Returns:
        float: calculated average precision
    """
    recalls = recalls[np.newaxis, :]
    precisions = precisions[np.newaxis, :]
    assert recalls.shape == precisions.shape and recalls.ndim == 2

    num_scales = recalls.shape[0]
    ap = 0.
    if mode == 'area':
        zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
        ones = np.ones((num_scales, 1), dtype=recalls.dtype)
        mrec = np.hstack((zeros, recalls, ones))
        mpre = np.hstack((zeros, precisions, zeros))
        for i in range(mpre.shape[1] - 1, 0, -1):
            mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
        ind = np.where(mrec[0, 1:] != mrec[0, :-1])[0]
        ap = np.sum(
            (mrec[0, ind + 1] - mrec[0, ind]) * mpre[0, ind + 1])
    elif mode == '11points':
        for thr in np.arange(0, 1 + 1e-3, 0.1):
            # recalls/precisions are reshaped to a single row above, so index row 0
            precs = precisions[0, recalls[0, :] >= thr]
            prec = precs.max() if precs.size > 0 else 0
            ap += prec
        ap /= 11
    else:
        raise ValueError(
            'Unrecognized mode, only "area" and "11points" are supported')
    return ap
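# A small worked example of `average_precision` in 'area' mode; the recall and
# precision values are made-up numbers (4 detections against 5 GT instances),
# purely for illustration:
#
#   recalls    = np.array([0.2, 0.4, 0.4, 0.6])
#   precisions = np.array([1.0, 1.0, 0.67, 0.75])
#   average_precision(recalls, precisions, mode='area')
#
# The right-to-left maximum turns the precisions into the envelope
# [1.0, 1.0, 0.75, 0.75]; integrating it over the recall steps gives
# 0.2*1.0 + 0.2*1.0 + 0.2*0.75 + 0.4*0.0 = 0.55
# (the last 0.4 of recall is padded with precision 0).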
def instance_match(pred_lines: NDArray,
                   scores: NDArray,
                   gt_lines: NDArray,
                   thresholds: Union[Tuple, List],
                   metric: str = 'chamfer') -> List:
    """Compute whether detected lines are true positive or false positive.

    Args:
        pred_lines (array): Detected lines of a sample, of shape
            (M, INTERP_NUM, 2 or 3).
        scores (array): Confidence score of each line, of shape (M, ).
        gt_lines (array): GT lines of a sample, of shape
            (N, INTERP_NUM, 2 or 3).
        thresholds (list of tuple): List of thresholds.
        metric (str): Distance function for lines matching. Default: 'chamfer'.

    Returns:
        list_of_tp_fp (list): tp-fp matching result at all thresholds
    """
    if metric == 'chamfer':
        distance_fn = chamfer_distance
    elif metric == 'frechet':
        distance_fn = frechet_distance
    else:
        raise ValueError(f'unknown distance function {metric}')

    num_preds = pred_lines.shape[0]
    num_gts = gt_lines.shape[0]

    # tp and fp
    tp_fp_list = []
    tp = np.zeros((num_preds), dtype=np.float32)
    fp = np.zeros((num_preds), dtype=np.float32)

    # if there is no gt lines in this sample, then all pred lines are false positives
    if num_gts == 0:
        fp[...] = 1
        for thr in thresholds:
            tp_fp_list.append((tp.copy(), fp.copy()))
        return tp_fp_list

    if num_preds == 0:
        for thr in thresholds:
            tp_fp_list.append((tp.copy(), fp.copy()))
        return tp_fp_list

    assert pred_lines.shape[1] == gt_lines.shape[1], \
        "sample points num should be the same"

    # distance matrix: M x N
    matrix = np.zeros((num_preds, num_gts))
    # for i in range(num_preds):
    #     for j in range(num_gts):
    #         matrix[i, j] = distance_fn(pred_lines[i], gt_lines[j])
    matrix = chamfer_distance_batch(pred_lines, gt_lines)

    # for each det, the min distance with all gts
    matrix_min = matrix.min(axis=1)
    # for each det, which gt is the closest to it
    matrix_argmin = matrix.argmin(axis=1)
    # sort all dets in descending order by scores
    sort_inds = np.argsort(-scores)

    # match under different thresholds
    for thr in thresholds:
        tp = np.zeros((num_preds), dtype=np.float32)
        fp = np.zeros((num_preds), dtype=np.float32)

        gt_covered = np.zeros(num_gts, dtype=bool)
        for i in sort_inds:
            if matrix_min[i] <= thr:
                matched_gt = matrix_argmin[i]
                if not gt_covered[matched_gt]:
                    gt_covered[matched_gt] = True
                    tp[i] = 1
                else:
                    fp[i] = 1
            else:
                fp[i] = 1

        tp_fp_list.append((tp, fp))

    return tp_fp_list



================================================
FILE: plugin/datasets/evaluation/__init__.py
================================================



================================================
FILE: plugin/datasets/evaluation/distance.py
================================================
from scipy.spatial import distance
from numpy.typing import NDArray
import torch


def chamfer_distance(line1: NDArray, line2: NDArray) -> float:
    ''' Calculate chamfer distance between two lines. Make sure the lines are interpolated.

    Args:
        line1 (array): coordinates of line1
        line2 (array): coordinates of line2

    Returns:
        distance (float): chamfer distance
    '''
    dist_matrix = distance.cdist(line1, line2, 'euclidean')
    dist12 = dist_matrix.min(-1).sum() / len(line1)
    dist21 = dist_matrix.min(-2).sum() / len(line2)

    return (dist12 + dist21) / 2


def frechet_distance(line1: NDArray, line2: NDArray) -> float:
    ''' Calculate frechet distance between two lines. Make sure the lines are interpolated.

    Args:
        line1 (array): coordinates of line1
        line2 (array): coordinates of line2

    Returns:
        distance (float): frechet distance
    '''
    raise NotImplementedError


def chamfer_distance_batch(pred_lines, gt_lines):
    ''' Calculate chamfer distance between two group of lines.
    Make sure the lines are interpolated.
Args: pred_lines (array or tensor): shape (m, num_pts, 2 or 3) gt_lines (array or tensor): shape (n, num_pts, 2 or 3) Returns: distance (array): chamfer distance ''' _, num_pts, coord_dims = pred_lines.shape if not isinstance(pred_lines, torch.Tensor): pred_lines = torch.tensor(pred_lines) if not isinstance(gt_lines, torch.Tensor): gt_lines = torch.tensor(gt_lines) dist_mat = torch.cdist(pred_lines.view(-1, coord_dims), gt_lines.view(-1, coord_dims), p=2) # (num_query*num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts)) # (num_query, num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=-1)) # (num_gt, num_q, num_pts, num_pts) dist1 = dist_mat.min(-1)[0].sum(-1) dist2 = dist_mat.min(-2)[0].sum(-1) dist_matrix = (dist1 + dist2).transpose(0, 1) / (2 * num_pts) return dist_matrix.numpy() ================================================ FILE: plugin/datasets/evaluation/raster_eval.py ================================================ import torch from mmdet3d.datasets import build_dataset, build_dataloader import mmcv from functools import cached_property import prettytable from numpy.typing import NDArray from typing import Dict, Optional from logging import Logger from mmcv import Config from copy import deepcopy N_WORKERS = 16 class RasterEvaluate(object): """Evaluator for rasterized map. Args: dataset_cfg (Config): dataset cfg for gt n_workers (int): num workers to parallel """ def __init__(self, dataset_cfg: Config, n_workers: int=N_WORKERS): self.dataset = build_dataset(dataset_cfg) self.dataloader = build_dataloader( self.dataset, samples_per_gpu=1, workers_per_gpu=n_workers, shuffle=False, dist=False) self.cat2id = self.dataset.cat2id self.id2cat = {v: k for k, v in self.cat2id.items()} self.n_workers = n_workers @cached_property def gts(self) -> Dict[str, NDArray]: print('collecting gts...') gts = {} for data in mmcv.track_iter_progress(self.dataloader): token = deepcopy(data['img_metas'].data[0][0]['token']) gt = deepcopy(data['semantic_mask'].data[0][0]) gts[token] = gt del data # avoid dataloader memory crash return gts def evaluate(self, result_path: str, logger: Optional[Logger]=None) -> Dict[str, float]: ''' Do evaluation for a submission file and print evalution results to `logger` if specified. The submission will be aligned by tokens before evaluation. Args: result_path (str): path to submission file logger (Logger): logger to print evaluation result, Default: None Returns: result_dict (Dict): evaluation results. IoU by categories. 
''' results = mmcv.load(result_path) meta = results['meta'] results = results['results'] result_dict = {} gts = [] preds = [] for token, gt in self.gts.items(): gts.append(gt) pred = torch.zeros((len(self.cat2id), gt.shape[1], gt.shape[2])).bool() if token in results: semantic_mask = torch.tensor(results[token]['semantic_mask']) for label_i in range(gt.shape[0]): pred[label_i] = (semantic_mask == label_i+1) preds.append(pred) preds = torch.stack(preds).bool() gts = torch.stack(gts).bool() # TODO: flip the gt gts = torch.flip(gts, [2,]) # for every label total = 0 for i in range(gts.shape[1]): category = self.id2cat[i] pred = preds[:, i] gt = gts[:, i] intersect = (pred & gt).sum().float().item() union = (pred | gt).sum().float().item() result_dict[category] = intersect / (union + 1e-7) total += result_dict[category] mIoU = total / gts.shape[1] result_dict['mIoU'] = mIoU categories = list(self.cat2id.keys()) table = prettytable.PrettyTable([' ', *categories, 'mean']) table.add_row(['IoU', *[round(result_dict[cat], 4) for cat in categories], round(mIoU, 4)]) if logger: from mmcv.utils import print_log print_log('\n'+str(table), logger=logger) print_log(f'mIoU = {mIoU:.4f}\n', logger=logger) return result_dict ================================================ FILE: plugin/datasets/evaluation/vector_eval.py ================================================ from functools import partial import numpy as np from multiprocessing import Pool from mmdet3d.datasets import build_dataset, build_dataloader import mmcv from .AP import instance_match, average_precision import prettytable from time import time from functools import cached_property from shapely.geometry import LineString from numpy.typing import NDArray from typing import Dict, List, Optional from logging import Logger from mmcv import Config from copy import deepcopy import os INTERP_NUM = 200 # number of points to interpolate during evaluation THRESHOLDS = [0.5, 1.0, 1.5] # AP thresholds N_WORKERS = 16 # num workers to parallel SAMPLE_DIST = 0.15 class VectorEvaluate(object): """Evaluator for vectorized map. 
Args: dataset_cfg (Config): dataset cfg for gt n_workers (int): num workers to parallel """ def __init__(self, dataset_cfg: Config, n_workers: int=N_WORKERS) -> None: self.dataset = build_dataset(dataset_cfg) self.cat2id = self.dataset.cat2id self.id2cat = {v: k for k, v in self.cat2id.items()} self.n_workers = n_workers self.new_split = 'newsplit' in self.dataset.ann_file self.roi_size = self.dataset.roi_size if self.roi_size == (60, 30): self.thresholds = [0.5, 1.0, 1.5] elif self.roi_size == (100, 50): self.thresholds = [1.0, 1.5, 2.0] @cached_property def gts(self) -> Dict[str, Dict[int, List[NDArray]]]: roi_size = self.dataset.roi_size if 'av2' in self.dataset.ann_file: dataset = 'av2' else: dataset = 'nusc' if self.new_split: tmp_file = f'./tmp_gts_{dataset}_{roi_size[0]}x{roi_size[1]}_newsplit.pkl' else: tmp_file = f'./tmp_gts_{dataset}_{roi_size[0]}x{roi_size[1]}.pkl' if os.path.exists(tmp_file): print(f'loading cached gts from {tmp_file}') gts = mmcv.load(tmp_file) return gts print('collecting gts...') gts = {} self.dataloader = build_dataloader( self.dataset, samples_per_gpu=1, workers_per_gpu=self.n_workers, shuffle=False, dist=False) pbar = mmcv.ProgressBar(len(self.dataloader)) for data in self.dataloader: token = deepcopy(data['img_metas'].data[0][0]['token']) gt = deepcopy(data['vectors'].data[0][0]) gts[token] = gt pbar.update() del data # avoid dataloader memory crash if not os.path.exists(tmp_file): print(f"saving gt to {tmp_file}") mmcv.dump(gts, tmp_file) return gts def interp_fixed_num(self, vector: NDArray, num_pts: int) -> NDArray: ''' Interpolate a polyline. Args: vector (array): line coordinates, shape (M, 2) num_pts (int): Returns: sampled_points (array): interpolated coordinates ''' line = LineString(vector) distances = np.linspace(0, line.length, num_pts) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def interp_fixed_dist(self, vector: NDArray, sample_dist: float) -> NDArray: ''' Interpolate a line at fixed interval. Args: vector (LineString): vector sample_dist (float): sample interval Returns: points (array): interpolated points, shape (N, 2) ''' line = LineString(vector) distances = list(np.arange(sample_dist, line.length, sample_dist)) # make sure to sample at least two points when sample_dist > line.length distances = [0,] + distances + [line.length,] sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def _evaluate_single(self, pred_vectors: List, scores: List, groundtruth: List, thresholds: List, metric: str='metric') -> Dict[int, NDArray]: ''' Do single-frame matching for one class. Args: pred_vectors (List): List[vector(ndarray) (different length)], scores (List): List[score(float)] groundtruth (List): List of vectors thresholds (List): List of thresholds Returns: tp_fp_score_by_thr (Dict): matching results at different thresholds e.g. 
{0.5: (M, 2), 1.0: (M, 2), 1.5: (M, 2)} ''' pred_lines = [] # interpolate predictions for vector in pred_vectors: vector = np.array(vector) vector_interp = self.interp_fixed_num(vector, INTERP_NUM) pred_lines.append(vector_interp) if pred_lines: pred_lines = np.stack(pred_lines) else: pred_lines = np.zeros((0, INTERP_NUM, 2)) # interpolate groundtruth gt_lines = [] for vector in groundtruth: vector_interp = self.interp_fixed_num(vector, INTERP_NUM) gt_lines.append(vector_interp) if gt_lines: gt_lines = np.stack(gt_lines) else: gt_lines = np.zeros((0, INTERP_NUM, 2)) scores = np.array(scores) tp_fp_list = instance_match(pred_lines, scores, gt_lines, thresholds, metric) # (M, 2) tp_fp_score_by_thr = {} for i, thr in enumerate(thresholds): tp, fp = tp_fp_list[i] tp_fp_score = np.hstack([tp[:, None], fp[:, None], scores[:, None]]) tp_fp_score_by_thr[thr] = tp_fp_score return tp_fp_score_by_thr # {0.5: (M, 2), 1.0: (M, 2), 1.5: (M, 2)} def evaluate(self, result_path: str, metric: str='chamfer', logger: Optional[Logger]=None) -> Dict[str, float]: ''' Do evaluation for a submission file and print evalution results to `logger` if specified. The submission will be aligned by tokens before evaluation. We use multi-worker to speed up. Args: result_path (str): path to submission file metric (str): distance metric. Default: 'chamfer' logger (Logger): logger to print evaluation result, Default: None Returns: new_result_dict (Dict): evaluation results. AP by categories. ''' results = mmcv.load(result_path) results = results['results'] # re-group samples and gt by label samples_by_cls = {label: [] for label in self.id2cat.keys()} num_gts = {label: 0 for label in self.id2cat.keys()} num_preds = {label: 0 for label in self.id2cat.keys()} # align by token for token, gt in self.gts.items(): if token in results.keys(): pred = results[token] else: pred = {'vectors': [], 'scores': [], 'labels': []} # for every sample vectors_by_cls = {label: [] for label in self.id2cat.keys()} scores_by_cls = {label: [] for label in self.id2cat.keys()} for i in range(len(pred['labels'])): # i-th pred line in sample label = pred['labels'][i] vector = pred['vectors'][i] score = pred['scores'][i] vectors_by_cls[label].append(vector) scores_by_cls[label].append(score) for label in self.id2cat.keys(): new_sample = (vectors_by_cls[label], scores_by_cls[label], gt[label]) num_gts[label] += len(gt[label]) num_preds[label] += len(scores_by_cls[label]) samples_by_cls[label].append(new_sample) result_dict = {} print(f'\nevaluating {len(self.id2cat)} categories...') start = time() if self.n_workers > 0: pool = Pool(self.n_workers) sum_mAP = 0 pbar = mmcv.ProgressBar(len(self.id2cat)) for label in self.id2cat.keys(): samples = samples_by_cls[label] # List[(pred_lines, scores, gts)] result_dict[self.id2cat[label]] = { 'num_gts': num_gts[label], 'num_preds': num_preds[label] } sum_AP = 0 fn = partial(self._evaluate_single, thresholds=self.thresholds, metric=metric) if self.n_workers > 0: tpfp_score_list = pool.starmap(fn, samples) else: tpfp_score_list = [] for sample in samples: tpfp_score_list.append(fn(*sample)) for thr in self.thresholds: tp_fp_score = [i[thr] for i in tpfp_score_list] tp_fp_score = np.vstack(tp_fp_score) # (num_dets, 3) sort_inds = np.argsort(-tp_fp_score[:, -1]) tp = tp_fp_score[sort_inds, 0] # (num_dets,) fp = tp_fp_score[sort_inds, 1] # (num_dets,) tp = np.cumsum(tp, axis=0) fp = np.cumsum(fp, axis=0) eps = np.finfo(np.float32).eps recalls = tp / np.maximum(num_gts[label], eps) precisions = tp / np.maximum((tp + fp), 
eps) AP = average_precision(recalls, precisions, 'area') sum_AP += AP result_dict[self.id2cat[label]].update({f'AP@{thr}': AP}) pbar.update() AP = sum_AP / len(self.thresholds) sum_mAP += AP result_dict[self.id2cat[label]].update({f'AP': AP}) if self.n_workers > 0: pool.close() mAP = sum_mAP / len(self.id2cat.keys()) result_dict.update({'mAP': mAP}) print(f"finished in {time() - start:.2f}s") # print results table = prettytable.PrettyTable(['category', 'num_preds', 'num_gts'] + [f'AP@{thr}' for thr in self.thresholds] + ['AP']) for label in self.id2cat.keys(): table.add_row([ self.id2cat[label], result_dict[self.id2cat[label]]['num_preds'], result_dict[self.id2cat[label]]['num_gts'], *[round(result_dict[self.id2cat[label]][f'AP@{thr}'], 4) for thr in self.thresholds], round(result_dict[self.id2cat[label]]['AP'], 4), ]) from mmcv.utils import print_log print_log('\n'+str(table), logger=logger) mAP_normal = 0 for label in self.id2cat.keys(): for thr in self.thresholds: mAP_normal += result_dict[self.id2cat[label]][f'AP@{thr}'] mAP_normal = mAP_normal / 9 print_log(f'mAP_normal = {mAP_normal:.4f}\n', logger=logger) # print_log(f'mAP_hard = {mAP_easy:.4f}\n', logger=logger) new_result_dict = {} for name in self.cat2id: new_result_dict[name] = result_dict[name]['AP'] return new_result_dict ================================================ FILE: plugin/datasets/map_utils/av2map_extractor.py ================================================ from av2.map.map_api import ArgoverseStaticMap from pathlib import Path from shapely.geometry import LineString, box, Polygon from shapely import ops import numpy as np from .utils import split_collections, get_drivable_area_contour, \ get_ped_crossing_contour, remove_repeated_lines, transform_from, \ connect_lines, remove_boundary_dividers, remove_repeated_lanesegment, reassign_graph_attribute from numpy.typing import NDArray from typing import Dict, List, Tuple, Union from av2.geometry.se3 import SE3 from nuscenes.map_expansion.map_api import NuScenesMapExplorer import networkx as nx from nuscenes.eval.common.utils import quaternion_yaw, Quaternion from shapely.geometry import Polygon, LineString, box, MultiPolygon, MultiLineString from shapely.strtree import STRtree from shapely.geometry import CAP_STYLE, JOIN_STYLE class AV2MapExtractor(object): """Argoverse 2 map ground-truth extractor. 
Args: roi_size (tuple or list): bev range id2map (dict): log id to map json path """ def __init__(self, roi_size: Union[Tuple, List], id2map: Dict) -> None: self.roi_size = roi_size self.id2map = {} for log_id, path in id2map.items(): self.id2map[log_id] = ArgoverseStaticMap.from_json(Path(path)) def generate_nearby_dividers(self,avm, e2g_translation, e2g_rotation,patch): def get_path(ls_dict): pts_G = nx.DiGraph() junction_pts_list = [] tmp=ls_dict for key, value in tmp.items(): centerline_geom = LineString(value['polyline'].xyz) centerline_pts = np.array(centerline_geom.coords).round(3) start_pt = centerline_pts[0] end_pt = centerline_pts[-1] for idx, pts in enumerate(centerline_pts[:-1]): pts_G.add_edge(tuple(centerline_pts[idx]),tuple(centerline_pts[idx+1])) valid_incoming_num = 0 for idx, pred in enumerate(value['predecessors']): if pred in tmp.keys(): valid_incoming_num += 1 pred_geom = LineString(tmp[pred]['polyline'].xyz) pred_pt = np.array(pred_geom.coords).round(3)[-1] if pred_pt[0] == start_pt[0] and pred_pt[1] == start_pt[1] and pred_pt[2] == start_pt[2]: pass else: pts_G.add_edge(tuple(pred_pt), tuple(start_pt)) if valid_incoming_num > 1: junction_pts_list.append(tuple(start_pt)) valid_outgoing_num = 0 for idx, succ in enumerate(value['successors']): if succ in tmp.keys(): valid_outgoing_num += 1 succ_geom = LineString(tmp[succ]['polyline'].xyz) succ_pt = np.array(succ_geom.coords).round(3)[0] if end_pt[0] == succ_pt[0] and end_pt[1] == succ_pt[1] and end_pt[2] == succ_pt[2]: pass else: pts_G.add_edge(tuple(end_pt), tuple(succ_pt)) if valid_outgoing_num > 1: junction_pts_list.append(tuple(end_pt)) roots = (v for v, d in pts_G.in_degree() if d == 0) roots_list = [v for v, d in pts_G.in_degree() if d == 0] notroot_list = [v for v in pts_G.nodes if v not in roots_list] leaves = [v for v,d in pts_G.out_degree() if d==0] ### find path from each root to leaves all_paths = [] for root in roots: for leave in leaves: paths = nx.all_simple_paths(pts_G, root, leave) all_paths.extend(paths) for single_path in all_paths: for single_node in single_path: if single_node in notroot_list: notroot_list.remove(single_node) final_centerline_paths = [] for path in all_paths: merged_line = LineString(path) # pdb.set_trace() merged_line = merged_line.simplify(0.2, preserve_topology=True) final_centerline_paths.append(merged_line) local_centerline_paths = final_centerline_paths return local_centerline_paths left_lane_dict = {} right_lane_dict = {} scene_ls_list = avm.get_scenario_lane_segments() scene_ls_dict = dict() for ls in scene_ls_list: scene_ls_dict[ls.id] = dict( ls=ls, polygon = Polygon(ls.polygon_boundary), predecessors=ls.predecessors, successors=ls.successors ) nearby_ls_dict = dict() for key, value in scene_ls_dict.items(): polygon = value['polygon'] if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: nearby_ls_dict[key] = value['ls'] ls_dict = nearby_ls_dict divider_ls_dict = dict() for key, value in ls_dict.items(): if not value.is_intersection: divider_ls_dict[key] = value left_lane_dict = {} right_lane_dict = {} for key,value in divider_ls_dict.items(): if value.left_neighbor_id is not None: left_lane_dict[key] = dict( polyline=value.left_lane_boundary, predecessors = value.predecessors, successors = value.successors, left_neighbor_id = value.left_neighbor_id, ) if value.right_neighbor_id is not None: right_lane_dict[key] = dict( polyline = value.right_lane_boundary, predecessors = value.predecessors, successors = value.successors, 
right_neighbor_id = value.right_neighbor_id, ) for key, value in left_lane_dict.items(): if value['left_neighbor_id'] in right_lane_dict.keys(): del right_lane_dict[value['left_neighbor_id']] for key, value in right_lane_dict.items(): if value['right_neighbor_id'] in left_lane_dict.keys(): del left_lane_dict[value['right_neighbor_id']] left_lane_dict = remove_repeated_lanesegment(left_lane_dict) right_lane_dict = remove_repeated_lanesegment(right_lane_dict) left_lane_dict = reassign_graph_attribute(left_lane_dict) right_lane_dict = reassign_graph_attribute(right_lane_dict) left_paths = get_path(left_lane_dict) right_paths = get_path(right_lane_dict) local_dividers = left_paths + right_paths return local_dividers def proc_polygon(self,polygon, ego_SE3_city): interiors = [] exterior_cityframe = np.array(list(polygon.exterior.coords)) exterior_egoframe = ego_SE3_city.transform_point_cloud(exterior_cityframe) for inter in polygon.interiors: inter_cityframe = np.array(list(inter.coords)) inter_egoframe = ego_SE3_city.transform_point_cloud(inter_cityframe) interiors.append(inter_egoframe[:,:3]) new_polygon = Polygon(exterior_egoframe[:,:3], interiors) return new_polygon def proc_line(self,line,ego_SE3_city): new_line_pts_cityframe = np.array(list(line.coords)) new_line_pts_egoframe = ego_SE3_city.transform_point_cloud(new_line_pts_cityframe) line = LineString(new_line_pts_egoframe[:,:3]) #TODO return line def extract_local_divider(self,nearby_dividers, ego_SE3_city, patch_box, patch_angle,patch_size): patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) # pdb.set_trace() # final_pgeom = remove_repeated_lines(nearby_dividers) line_list = [] # pdb.set_trace() for line in nearby_dividers: if line.is_empty: # Skip lines without nodes. continue new_line = line.intersection(patch) if not new_line.is_empty: if new_line.geom_type == 'MultiLineString': for single_line in new_line.geoms: if single_line.is_empty: continue single_line = self.proc_line(single_line,ego_SE3_city) line_list.append(single_line) else: new_line = self.proc_line(new_line, ego_SE3_city) line_list.append(new_line) centerlines = line_list poly_centerlines = [line.buffer(0.1, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) for line in centerlines] index_by_id = dict((id(pt), i) for i, pt in enumerate(poly_centerlines)) tree = STRtree(poly_centerlines) final_pgeom = [] remain_idx = [i for i in range(len(centerlines))] for i, pline in enumerate(poly_centerlines): if i not in remain_idx: continue remain_idx.pop(remain_idx.index(i)) final_pgeom.append(centerlines[i]) for o in tree.query(pline): o_idx = index_by_id[id(o)] if o_idx not in remain_idx: continue inter = o.intersection(pline).area union = o.union(pline).area iou = inter / union if iou >= 0.90: remain_idx.pop(remain_idx.index(o_idx)) # return [np.array(line.coords) for line in final_pgeom] final_pgeom = connect_lines(final_pgeom) return final_pgeom def extract_local_boundary(self,avm, ego_SE3_city, patch_box, patch_angle,patch_size): boundary_list = [] patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) for da in avm.get_scenario_vector_drivable_areas(): boundary_list.append(da.xyz) polygon_list = [] for da in boundary_list: exterior_coords = da interiors = [] # polygon = Polygon(exterior_coords, interiors) polygon = Polygon(exterior_coords, interiors) if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: if new_polygon.geom_type is 'Polygon': if not new_polygon.is_valid: continue new_polygon = 
self.proc_polygon(new_polygon,ego_SE3_city) if not new_polygon.is_valid: continue elif new_polygon.geom_type is 'MultiPolygon': polygons = [] for single_polygon in new_polygon.geoms: if not single_polygon.is_valid or single_polygon.is_empty: continue new_single_polygon = self.proc_polygon(single_polygon,ego_SE3_city) if not new_single_polygon.is_valid: continue polygons.append(new_single_polygon) if len(polygons) == 0: continue new_polygon = MultiPolygon(polygons) if not new_polygon.is_valid: continue else: raise ValueError('{} is not valid'.format(new_polygon.geom_type)) if new_polygon.geom_type is 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) union_segments = ops.unary_union(polygon_list) max_x = patch_size[1] / 2 max_y = patch_size[0] / 2 local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) exteriors = [] interiors = [] if union_segments.geom_type != 'MultiPolygon': union_segments = MultiPolygon([union_segments]) for poly in union_segments.geoms: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: if ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) for inter in interiors: if not inter.is_ccw: inter.coords = list(inter.coords)[::-1] lines = inter.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) boundary_lines = [] for line in results: if not line.is_empty: if line.geom_type == 'MultiLineString': for single_line in line.geoms: boundary_lines.append(single_line) elif line.geom_type == 'LineString': boundary_lines.append(line) else: raise NotImplementedError return boundary_lines def get_scene_dividers(self,avm,patch_box,patch_angle): patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) scene_ls_list = avm.get_scenario_lane_segments() # pdb.set_trace() scene_ls_dict = dict() for ls in scene_ls_list: scene_ls_dict[ls.id] = dict( ls=ls, polygon = Polygon(ls.polygon_boundary), predecessors=ls.predecessors, successors=ls.successors ) nearby_ls_dict = dict() for key, value in scene_ls_dict.items(): polygon = value['polygon'] if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: nearby_ls_dict[key] = value['ls'] ls_dict = nearby_ls_dict divider_ls_dict = dict() for key, value in ls_dict.items(): if not value.is_intersection: divider_ls_dict[key] = value return divider_ls_dict def get_scene_ped_crossings(self,avm,e2g_translation,e2g_rotation,polygon_ped=True): g2e_translation = e2g_rotation.T.dot(-e2g_translation) g2e_rotation = e2g_rotation.T roi_x, roi_y = self.roi_size[:2] local_patch = box(-roi_x / 2, -roi_y / 2, roi_x / 2, roi_y / 2) ped_crossings = [] for _, pc in avm.vector_pedestrian_crossings.items(): edge1_xyz = pc.edge1.xyz edge2_xyz = pc.edge2.xyz ego1_xyz = transform_from(edge1_xyz, g2e_translation, g2e_rotation) ego2_xyz = transform_from(edge2_xyz, g2e_translation, g2e_rotation) # if True, organize each ped crossing as closed polylines. if polygon_ped: vertices = np.concatenate([ego1_xyz, ego2_xyz[::-1, :]]) p = Polygon(vertices) line = get_ped_crossing_contour(p, local_patch) if line is not None: if len(line.coords) < 3 or Polygon(line).area < 1: continue ped_crossings.append(line) # Otherwise organize each ped crossing as two parallel polylines. 
else: line1 = LineString(ego1_xyz) line2 = LineString(ego2_xyz) line1_local = line1.intersection(local_patch) line2_local = line2.intersection(local_patch) # take the whole ped cross if all two edges are in roi range if not line1_local.is_empty and not line2_local.is_empty: ped_crossings.append(line1_local) ped_crossings.append(line2_local) return ped_crossings def get_map_geom(self, log_id: str, e2g_translation: NDArray, e2g_rotation: NDArray, polygon_ped=True) -> Dict[str, List[Union[LineString, Polygon]]]: ''' Extract geometries given `log_id` and ego pose. Args: log_id (str): log id e2g_translation (array): ego2global translation, shape (3,) e2g_rotation (array): ego2global rotation matrix, shape (3, 3) polygon_ped: if True, organize each ped crossing as closed polylines. \ Otherwise organize each ped crossing as two parallel polylines. \ Default: True Returns: geometries (Dict): extracted geometries by category. ''' avm = self.id2map[log_id] patch_h = self.roi_size[1] patch_w = self.roi_size[0] patch_size = (patch_h, patch_w) map_pose = e2g_translation[:2] rotation = Quaternion._from_matrix(e2g_rotation) patch_box = (map_pose[0], map_pose[1], patch_size[0], patch_size[1]) patch_angle = quaternion_yaw(rotation) / np.pi * 180 city_SE2_ego = SE3(e2g_rotation, e2g_translation) ego_SE3_city = city_SE2_ego.inverse() patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) nearby_dividers = self.generate_nearby_dividers(avm, e2g_translation,e2g_rotation,patch) # pdb.set_trace() map_anno=dict( divider=[], ped_crossing=[], boundary=[], drivable_area=[], ) map_anno['ped_crossing'] = self.get_scene_ped_crossings(avm,e2g_translation,e2g_rotation,polygon_ped=polygon_ped) map_anno['boundary'] = self.extract_local_boundary(avm, ego_SE3_city, patch_box, patch_angle,patch_size) # map_anno['centerline'] = extract_local_centerline(nearby_centerlines, ego_SE3_city, patch_box, patch_angle,patch_size) all_dividers = self.extract_local_divider(nearby_dividers, ego_SE3_city, patch_box, patch_angle,patch_size) map_anno['divider'] = remove_boundary_dividers(all_dividers,map_anno['boundary']) ######## return map_anno ================================================ FILE: plugin/datasets/map_utils/nuscmap_extractor.py ================================================ from shapely.geometry import LineString, box, Polygon from shapely import ops, strtree import numpy as np from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer from nuscenes.eval.common.utils import quaternion_yaw from pyquaternion import Quaternion from .utils import split_collections, get_drivable_area_contour, get_ped_crossing_contour from numpy.typing import NDArray from typing import Dict, List, Tuple, Union from shapely.geometry import Polygon, MultiPolygon, LineString, Point, box, MultiLineString from shapely import affinity, ops import networkx as nx class NuscMapExtractor(object): """NuScenes map ground-truth extractor. 
Args: data_root (str): path to nuScenes dataset roi_size (tuple or list): bev range """ def __init__(self, data_root: str, roi_size: Union[List, Tuple]) -> None: self.roi_size = roi_size self.MAPS = ['boston-seaport', 'singapore-hollandvillage', 'singapore-onenorth', 'singapore-queenstown'] self.nusc_maps = {} self.map_explorer = {} for loc in self.MAPS: self.nusc_maps[loc] = NuScenesMap( dataroot=data_root, map_name=loc) self.map_explorer[loc] = CNuScenesMapExplorer(self.nusc_maps[loc]) def get_map_geom(self, location: str, e2g_translation: Union[List, NDArray], e2g_rotation: Union[List, NDArray]) -> Dict[str, List[Union[LineString, Polygon]]]: # Borrowed from MapTR's codebase to make sure data are the same # (center_x, center_y, len_y, len_x) in nuscenes format patch_size_ego_coord = (self.roi_size[1], self.roi_size[0]) patch_size_lidar_coord = (self.roi_size[0], self.roi_size[1]) vector_map_maptr = VectorizedLocalMap(self.nusc_maps[location], self.map_explorer[location], patch_size_lidar_coord, patch_size_ego_coord, map_classes=['divider','ped_crossing','boundary']) map_annos = vector_map_maptr.gen_vectorized_samples(e2g_translation, e2g_rotation) return dict( divider=map_annos['divider'], # List[LineString] ped_crossing=map_annos['ped_crossing'], # List[LineString] boundary=map_annos['boundary'], # List[LineString] drivable_area=[], # List[Polygon], ) class VectorizedLocalMap(object): CLASS2LABEL = { 'road_divider': 0, 'lane_divider': 0, 'ped_crossing': 1, 'contours': 2, 'others': -1 } def __init__(self, nusc_map, map_explorer, patch_size, roi_size, map_classes=['divider','ped_crossing','boundary','centerline'], line_classes=['road_divider', 'lane_divider'], ped_crossing_classes=['ped_crossing'], contour_classes=['road_segment', 'lane'], centerline_classes=['lane_connector','lane'], use_simplify=True, ): super().__init__() self.nusc_map = nusc_map self.map_explorer = map_explorer self.vec_classes = map_classes self.line_classes = line_classes self.ped_crossing_classes = ped_crossing_classes self.polygon_classes = contour_classes self.centerline_classes = centerline_classes self.patch_size = patch_size self.roi_size = roi_size self.local_patch = box(-self.roi_size[0] / 2, -self.roi_size[1] / 2, self.roi_size[0] / 2, self.roi_size[1] / 2) def gen_vectorized_samples(self, lidar2global_translation, lidar2global_rotation): ''' use lidar2global to get gt map layers ''' map_pose = lidar2global_translation[:2] rotation = Quaternion(lidar2global_rotation) # import ipdb;ipdb.set_trace() patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1]) patch_angle = quaternion_yaw(rotation) / np.pi * 180 map_dict = {'divider':[],'ped_crossing':[],'boundary':[],'centerline':[]} vectors = [] for vec_class in self.vec_classes: if vec_class == 'divider': line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes) line_instances_dict = self.line_geoms_to_instances(line_geom) for line_type, instances in line_instances_dict.items(): for instance in instances: instance = affinity.rotate(instance, -90, origin=(0, 0), use_radians=False) map_dict[vec_class].append(instance) # vectors.append((instance, self.CLASS2LABEL.get(line_type, -1))) elif vec_class == 'ped_crossing': ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes) ped_instance_list = ped_geom['ped_crossing'] #ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom) for instance in ped_instance_list: # vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1))) instance = 
affinity.rotate(instance, -90, origin=(0, 0), use_radians=False) map_dict[vec_class].append(instance) elif vec_class == 'boundary': polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes) poly_bound_list = self.poly_geoms_to_instances(polygon_geom) for instance in poly_bound_list: # import ipdb;ipdb.set_trace() instance = affinity.rotate(instance, -90, origin=(0, 0), use_radians=False) map_dict[vec_class].append(instance) # vectors.append((contour, self.CLASS2LABEL.get('contours', -1))) elif vec_class =='centerline': centerline_geom = self.get_centerline_geom(patch_box, patch_angle, self.centerline_classes) centerline_list = self.centerline_geoms_to_instances(centerline_geom) for instance in centerline_list: instance = affinity.rotate(instance, -90, origin=(0, 0), use_radians=False) map_dict[vec_class].append(instance) else: raise ValueError(f'WRONG vec_class: {vec_class}') return map_dict def get_centerline_geom(self, patch_box, patch_angle, layer_names): map_geom = {} for layer_name in layer_names: if layer_name in self.centerline_classes: return_token = False layer_centerline_dict = self.map_explorer._get_centerline( patch_box, patch_angle, layer_name, return_token=return_token) if len(layer_centerline_dict.keys()) == 0: continue # import ipdb;ipdb.set_trace() map_geom.update(layer_centerline_dict) return map_geom def get_map_geom(self, patch_box, patch_angle, layer_names): map_geom = {} for layer_name in layer_names: if layer_name in self.line_classes: geoms = self.get_divider_line(patch_box, patch_angle, layer_name) # map_geom.append((layer_name, geoms)) map_geom[layer_name] = geoms elif layer_name in self.polygon_classes: geoms = self.get_contour_line(patch_box, patch_angle, layer_name) # map_geom.append((layer_name, geoms)) map_geom[layer_name] = geoms elif layer_name in self.ped_crossing_classes: geoms = self.get_ped_crossing_line_stmmapnet(patch_box, patch_angle) # map_geom.append((layer_name, geoms)) map_geom[layer_name] = geoms return map_geom def get_divider_line(self,patch_box,patch_angle,layer_name): if layer_name not in self.map_explorer.map_api.non_geometric_line_layers: raise ValueError("{} is not a line layer".format(layer_name)) if layer_name == 'traffic_light': return None patch_x = patch_box[0] patch_y = patch_box[1] patch = self.map_explorer.get_patch_coord(patch_box, patch_angle) line_list = [] records = getattr(self.map_explorer.map_api, layer_name) for record in records: line = self.map_explorer.map_api.extract_line(record['line_token']) if line.is_empty: # Skip lines without nodes. 
continue new_line = line.intersection(patch) if not new_line.is_empty: new_line = affinity.rotate(new_line, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_line = affinity.affine_transform(new_line, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) line_list.append(new_line) return line_list def get_contour_line(self,patch_box,patch_angle,layer_name): if layer_name not in self.map_explorer.map_api.non_geometric_polygon_layers: raise ValueError('{} is not a polygonal layer'.format(layer_name)) patch_x = patch_box[0] patch_y = patch_box[1] patch = self.map_explorer.get_patch_coord(patch_box, patch_angle) records = getattr(self.map_explorer.map_api, layer_name) polygon_list = [] if layer_name == 'drivable_area': for record in records: polygons = [self.map_explorer.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] for polygon in polygons: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) else: for record in records: polygon = self.map_explorer.map_api.extract_polygon(record['polygon_token']) if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) return polygon_list def get_ped_crossing_line(self, patch_box, patch_angle): patch_x = patch_box[0] patch_y = patch_box[1] patch = self.map_explorer.get_patch_coord(patch_box, patch_angle) polygon_list = [] records = getattr(self.map_explorer.map_api, 'ped_crossing') # records = getattr(self.nusc_maps[location], 'ped_crossing') for record in records: polygon = self.map_explorer.map_api.extract_polygon(record['polygon_token']) if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) return polygon_list def _union_ped_stmmapnet(self, ped_geoms: List[Polygon]) -> List[Polygon]: ''' merge close ped crossings. 
Args: ped_geoms (list): list of Polygon Returns: union_ped_geoms (Dict): merged ped crossings ''' ped_geoms = sorted(ped_geoms, key=lambda x:x.area, reverse=True) def get_rec_direction(geom): rect = geom.minimum_rotated_rectangle rect_v_p = np.array(rect.exterior.coords)[:3] rect_v = rect_v_p[1:]-rect_v_p[:-1] v_len = np.linalg.norm(rect_v, axis=-1) longest_v_i = v_len.argmax() return rect_v[longest_v_i], v_len[longest_v_i] tree = strtree.STRtree(ped_geoms) index_by_id = dict((id(pt), i) for i, pt in enumerate(ped_geoms)) final_pgeom = [] remain_idx = [i for i in range(len(ped_geoms))] for i, pgeom in enumerate(ped_geoms): if i not in remain_idx: continue # update remain_idx.pop(remain_idx.index(i)) pgeom_v, pgeom_v_norm = get_rec_direction(pgeom) final_pgeom.append(pgeom) intersect_pgeom = tree.query(pgeom) intersect_pgeom = sorted(intersect_pgeom, key=lambda x:x.area, reverse=True) for o in intersect_pgeom: o_idx = index_by_id[id(o)] if o_idx not in remain_idx: continue o_v, o_v_norm = get_rec_direction(o) cos = pgeom_v.dot(o_v)/(pgeom_v_norm*o_v_norm) o_pgeom_union = o.union(pgeom) ch_union = o_pgeom_union.convex_hull ch_area_ratio = o_pgeom_union.area / ch_union.area # add an extra criterion for merging here to handle patch-boundary-case if 1 - np.abs(cos) < 0.01 and ch_area_ratio > 0.8: # theta < 8 degrees. final_pgeom[-1] =\ final_pgeom[-1].union(o) # update remain_idx.pop(remain_idx.index(o_idx)) final_pgeom = self._handle_small_peds(final_pgeom) results = [] for p in final_pgeom: results.extend(split_collections(p)) return results def _handle_small_peds(self, ped_geoms): def get_two_rec_directions(geom): rect = geom.minimum_rotated_rectangle rect_v_p = np.array(rect.exterior.coords)[:3] rect_v = rect_v_p[1:]-rect_v_p[:-1] v_len = np.linalg.norm(rect_v, axis=-1) return rect_v, v_len tree = strtree.STRtree(ped_geoms) index_by_id = dict((id(pt), i) for i, pt in enumerate(ped_geoms)) final_pgeom = [] remain_idx = [i for i in range(len(ped_geoms))] for i, pgeom in enumerate(ped_geoms): if i not in remain_idx: continue # update remain_idx.pop(remain_idx.index(i)) final_pgeom.append(pgeom) pgeom_v, pgeom_v_norm = get_two_rec_directions(pgeom) intersect_pgeom = tree.query(pgeom) intersect_pgeom = sorted(intersect_pgeom, key=lambda x:x.area, reverse=True) for o in intersect_pgeom: o_idx = index_by_id[id(o)] if o_idx not in remain_idx: continue if o.area >= pgeom.area: continue o_pgeom_union = o.union(pgeom) o_v, o_v_norm = get_two_rec_directions(o_pgeom_union) ch_union = o_pgeom_union.convex_hull ch_area_ratio = o_pgeom_union.area / ch_union.area #mrr_union = o_pgeom_union.minimum_rotated_rectangle #mrr_area_ratio = o_pgeom_union.area / mrr_union.area cos_00 = pgeom_v[0].dot(o_v[0])/(pgeom_v_norm[0]*o_v_norm[0]) cos_01 = pgeom_v[0].dot(o_v[1])/(pgeom_v_norm[0]*o_v_norm[1]) cos_10 = pgeom_v[1].dot(o_v[0])/(pgeom_v_norm[1]*o_v_norm[0]) cos_11 = pgeom_v[1].dot(o_v[1])/(pgeom_v_norm[1]*o_v_norm[1]) cos_checks = np.array([(1 - np.abs(cos) < 0.001) for cos in [cos_00, cos_01, cos_10, cos_11]]) # add an extra criterion for merging here to handle patch-boundary-case if cos_checks.sum() == 2 and ch_area_ratio > 0.8: final_pgeom[-1] =\ final_pgeom[-1].union(o) # update remain_idx.pop(remain_idx.index(o_idx)) return final_pgeom def get_ped_crossing_line_stmmapnet(self, patch_box, patch_angle): # get ped crossings ped_crossings = [] ped = self.map_explorer._get_layer_polygon( patch_box, patch_angle, 'ped_crossing') for p in ped: ped_crossings += split_collections(p) # some ped crossings are split into 
several small parts # we need to merge them ped_crossings = self._union_ped_stmmapnet(ped_crossings) # NOTE: clean-up noisy ped-crossing instances (for our cleaned training data only, maybe need to still # use the original version when evaluation...) # 1). filter too small ped_crossing merging results #areas = [p.area for p in ped_crossings] #print('Ped areas\n', areas) updated_ped_crossings = [] for p_idx, p in enumerate(ped_crossings): area = p.area if area < 1: continue elif area < 20: covered = False for other_idx, p_other in enumerate(ped_crossings): if other_idx != p_idx and p.covered_by(p_other): covered = True break if not covered: updated_ped_crossings.append(p) else: updated_ped_crossings.append(p) ped_crossing_lines = [] for p in updated_ped_crossings: # extract exteriors to get a closed polyline line = get_ped_crossing_contour(p, self.local_patch) if line is not None: ped_crossing_lines.append(line) return ped_crossing_lines def line_geoms_to_instances(self, line_geom): line_instances_dict = dict() for line_type, a_type_of_lines in line_geom.items(): one_type_instances = self._one_type_line_geom_to_instances(a_type_of_lines) line_instances_dict[line_type] = one_type_instances return line_instances_dict def _one_type_line_geom_to_instances(self, line_geom): line_instances = [] for line in line_geom: if not line.is_empty: if line.geom_type == 'MultiLineString': for single_line in line.geoms: line_instances.append(single_line) elif line.geom_type == 'LineString': line_instances.append(line) else: raise NotImplementedError return line_instances def ped_poly_geoms_to_instances(self, ped_geom): # ped = ped_geom[0][1] # import ipdb;ipdb.set_trace() ped = ped_geom['ped_crossing'] union_segments = ops.unary_union(ped) max_x = self.patch_size[1] / 2 max_y = self.patch_size[0] / 2 local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2) exteriors = [] interiors = [] if union_segments.geom_type != 'MultiPolygon': union_segments = MultiPolygon([union_segments]) for poly in union_segments.geoms: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: if ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) for inter in interiors: if not inter.is_ccw: inter.coords = list(inter.coords)[::-1] lines = inter.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) return self._one_type_line_geom_to_instances(results) def poly_geoms_to_instances(self, polygon_geom): roads = polygon_geom['road_segment'] lanes = polygon_geom['lane'] # import ipdb;ipdb.set_trace() union_roads = ops.unary_union(roads) union_lanes = ops.unary_union(lanes) union_segments = ops.unary_union([union_roads, union_lanes]) max_x = self.patch_size[1] / 2 max_y = self.patch_size[0] / 2 local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) exteriors = [] interiors = [] if union_segments.geom_type != 'MultiPolygon': union_segments = MultiPolygon([union_segments]) for poly in union_segments.geoms: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: if ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) for inter in interiors: if not inter.is_ccw: inter.coords = 
list(inter.coords)[::-1] lines = inter.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) return self._one_type_line_geom_to_instances(results) def centerline_geoms_to_instances(self,geoms_dict): centerline_geoms_list,pts_G = self.union_centerline(geoms_dict) # vectors_dict = self.centerline_geoms2vec(centerline_geoms_list) # import ipdb;ipdb.set_trace() return self._one_type_line_geom_to_instances(centerline_geoms_list) def centerline_geoms2vec(self, centerline_geoms_list): vector_dict = {} # import ipdb;ipdb.set_trace() # centerline_geoms_list = [line.simplify(0.2, preserve_topology=True) \ # for line in centerline_geoms_list] vectors = self._geom_to_vectors( centerline_geoms_list) vector_dict.update({'centerline': ('centerline', vectors)}) return vector_dict def union_centerline(self, centerline_geoms): # import ipdb;ipdb.set_trace() pts_G = nx.DiGraph() junction_pts_list = [] for key, value in centerline_geoms.items(): centerline_geom = value['centerline'] if centerline_geom.geom_type == 'MultiLineString': start_pt = np.array(centerline_geom.geoms[0].coords).round(3)[0] end_pt = np.array(centerline_geom.geoms[-1].coords).round(3)[-1] for single_geom in centerline_geom.geoms: single_geom_pts = np.array(single_geom.coords).round(3) for idx, pt in enumerate(single_geom_pts[:-1]): pts_G.add_edge(tuple(single_geom_pts[idx]),tuple(single_geom_pts[idx+1])) elif centerline_geom.geom_type == 'LineString': centerline_pts = np.array(centerline_geom.coords).round(3) start_pt = centerline_pts[0] end_pt = centerline_pts[-1] for idx, pts in enumerate(centerline_pts[:-1]): pts_G.add_edge(tuple(centerline_pts[idx]),tuple(centerline_pts[idx+1])) else: raise NotImplementedError valid_incoming_num = 0 for idx, pred in enumerate(value['incoming_tokens']): if pred in centerline_geoms.keys(): valid_incoming_num += 1 pred_geom = centerline_geoms[pred]['centerline'] if pred_geom.geom_type == 'MultiLineString': pred_pt = np.array(pred_geom.geoms[-1].coords).round(3)[-1] # if pred_pt != centerline_pts[0]: pts_G.add_edge(tuple(pred_pt), tuple(start_pt)) else: pred_pt = np.array(pred_geom.coords).round(3)[-1] pts_G.add_edge(tuple(pred_pt), tuple(start_pt)) if valid_incoming_num > 1: junction_pts_list.append(tuple(start_pt)) valid_outgoing_num = 0 for idx, succ in enumerate(value['outgoing_tokens']): if succ in centerline_geoms.keys(): valid_outgoing_num += 1 succ_geom = centerline_geoms[succ]['centerline'] if succ_geom.geom_type == 'MultiLineString': succ_pt = np.array(succ_geom.geoms[0].coords).round(3)[0] # if pred_pt != centerline_pts[0]: pts_G.add_edge(tuple(end_pt), tuple(succ_pt)) else: succ_pt = np.array(succ_geom.coords).round(3)[0] pts_G.add_edge(tuple(end_pt), tuple(succ_pt)) if valid_outgoing_num > 1: junction_pts_list.append(tuple(end_pt)) roots = (v for v, d in pts_G.in_degree() if d == 0) leaves = [v for v, d in pts_G.out_degree() if d == 0] all_paths = [] for root in roots: paths = nx.all_simple_paths(pts_G, root, leaves) all_paths.extend(paths) final_centerline_paths = [] for path in all_paths: merged_line = LineString(path) merged_line = merged_line.simplify(0.2, preserve_topology=True) final_centerline_paths.append(merged_line) return final_centerline_paths, pts_G class CNuScenesMapExplorer(NuScenesMapExplorer): def __ini__(self, *args, **kwargs): super(self, CNuScenesMapExplorer).__init__(*args, **kwargs) def _get_centerline(self, patch_box: Tuple[float, float, float, float], patch_angle: float, layer_name: str, return_token: bool 
= False) -> dict: """ Retrieve the centerline of a particular layer within the specified patch. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. :param layer_name: name of map layer to be extracted. :return: dict(token:record_dict, token:record_dict,...) """ if layer_name not in ['lane','lane_connector']: raise ValueError('{} is not a centerline layer'.format(layer_name)) patch_x = patch_box[0] patch_y = patch_box[1] patch = self.get_patch_coord(patch_box, patch_angle) records = getattr(self.map_api, layer_name) centerline_dict = dict() for record in records: if record['polygon_token'] is None: # import ipdb # ipdb.set_trace() continue polygon = self.map_api.extract_polygon(record['polygon_token']) # if polygon.intersects(patch) or polygon.within(patch): # if not polygon.is_valid: # print('within: {}, intersect: {}'.format(polygon.within(patch), polygon.intersects(patch))) # print('polygon token {} is_valid: {}'.format(record['polygon_token'], polygon.is_valid)) # polygon = polygon.buffer(0) if polygon.is_valid: # if within or intersect : new_polygon = polygon.intersection(patch) # new_polygon = polygon if not new_polygon.is_empty: centerline = self.map_api.discretize_lanes( record, 0.5) centerline = list(self.map_api.discretize_lanes([record['token']], 0.5).values())[0] centerline = LineString(np.array(centerline)[:,:2].round(3)) if centerline.is_empty: continue centerline = centerline.intersection(patch) if not centerline.is_empty: centerline = \ to_patch_coord(centerline, patch_angle, patch_x, patch_y) # centerline.coords = np.array(centerline.coords).round(3) # if centerline.geom_type != 'LineString': # import ipdb;ipdb.set_trace() record_dict = dict( centerline=centerline, token=record['token'], incoming_tokens=self.map_api.get_incoming_lane_ids(record['token']), outgoing_tokens=self.map_api.get_outgoing_lane_ids(record['token']), ) centerline_dict.update({record['token']: record_dict}) return centerline_dict def to_patch_coord(new_polygon, patch_angle, patch_x, patch_y): new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) return new_polygon ================================================ FILE: plugin/datasets/map_utils/utils.py ================================================ from shapely.geometry import LineString, box, Polygon, LinearRing from shapely.geometry.base import BaseGeometry from shapely import ops import numpy as np from scipy.spatial import distance from typing import List, Optional, Tuple from numpy.typing import NDArray def split_collections(geom: BaseGeometry) -> List[Optional[BaseGeometry]]: ''' Split Multi-geoms to list and check is valid or is empty. Args: geom (BaseGeometry): geoms to be split or validate. Returns: geometries (List): list of geometries. ''' assert geom.geom_type in ['MultiLineString', 'LineString', 'MultiPolygon', 'Polygon', 'GeometryCollection'], f"got geom type {geom.geom_type}" if 'Multi' in geom.geom_type: outs = [] for g in geom.geoms: if g.is_valid and not g.is_empty: outs.append(g) return outs else: if geom.is_valid and not geom.is_empty: return [geom,] else: return [] def get_drivable_area_contour(drivable_areas: List[Polygon], roi_size: Tuple) -> List[LineString]: ''' Extract drivable area contours to get list of boundaries. Args: drivable_areas (list): list of drivable areas. 
roi_size (tuple): bev range size Returns: boundaries (List): list of boundaries. ''' max_x = roi_size[0] / 2 max_y = roi_size[1] / 2 # a bit smaller than roi to avoid unexpected boundaries on edges local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) exteriors = [] interiors = [] for poly in drivable_areas: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: # NOTE: we make sure all exteriors are clock-wise # such that each boundary's right-hand-side is drivable area # and left-hand-side is walk way if ext.is_ccw: ext = LinearRing(list(ext.coords)[::-1]) lines = ext.intersection(local_patch) if lines.geom_type == 'GeometryCollection' and len(lines) == 0: continue if lines.geom_type == 'MultiLineString': lines = ops.linemerge(lines) assert lines.geom_type in ['MultiLineString', 'LineString'] results.extend(split_collections(lines)) for inter in interiors: # NOTE: we make sure all interiors are counter-clock-wise if not inter.is_ccw: inter = LinearRing(list(inter.coords)[::-1]) lines = inter.intersection(local_patch) if lines.geom_type == 'GeometryCollection' and len(lines) == 0: continue if lines.geom_type == 'MultiLineString': lines = ops.linemerge(lines) assert lines.geom_type in ['MultiLineString', 'LineString'] results.extend(split_collections(lines)) return results def get_ped_crossing_contour(polygon: Polygon, local_patch: box) -> Optional[LineString]: ''' Extract ped crossing contours to get a closed polyline. Different from `get_drivable_area_contour`, this function ensures a closed polyline. Args: polygon (Polygon): ped crossing polygon to be extracted. local_patch (tuple): local patch params Returns: line (LineString): a closed line ''' ext = polygon.exterior if not ext.is_ccw: ext = LinearRing(list(ext.coords)[::-1]) lines = ext.intersection(local_patch) if lines.type != 'LineString': # remove points in intersection results lines = [l for l in lines.geoms if l.geom_type != 'Point'] lines = ops.linemerge(lines) # same instance but not connected. if lines.type != 'LineString': ls = [] for l in lines.geoms: ls.append(np.array(l.coords)) lines = np.concatenate(ls, axis=0) lines = LineString(lines) if not lines.is_empty: start = list(lines.coords[0]) end = list(lines.coords[-1]) if not np.allclose(start, end, atol=1e-3): new_line = list(lines.coords) new_line.append(start) lines = LineString(new_line) # make ped cross closed return lines return None def remove_repeated_lines(lines: List[LineString]) -> List[LineString]: ''' Remove repeated dividers since each divider in argoverse2 is mentioned twice by both left lane and right lane. Args: lines (List): list of dividers Returns: lines (List): list of left dividers ''' new_lines = [] for line in lines: repeated = False for l in new_lines: length = min(line.length, l.length) # hand-crafted rule to check overlap # if line.buffer(0.01).intersection(l.buffer(0.01)).area \ # > 0.2 * length: # repeated = True # break area1 = line.buffer(0.1) area2 = l.buffer(0.1) inter = area1.intersection(area2).area union = area1.union(area2).area iou = inter / union if iou >= 0.90: repeated = True break if not repeated: new_lines.append(line) return new_lines def remove_repeated_lanesegment(lane_dict): ''' Remove repeated dividers since each divider in argoverse2 is mentioned twice by both left lane and right lane. 
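# --- Illustrative aside (not part of this file): a minimal, self-contained
# sketch of the buffered-IoU overlap test that `remove_repeated_lines` above
# (and `remove_repeated_lanesegment` below) rely on. Each polyline is dilated
# into a thin polygon with .buffer(), and the area IoU of the two polygons is
# thresholded; the radius/threshold values mirror the 0.1 / 0.90 used above.
from shapely.geometry import LineString

def _lines_look_duplicated(a, b, radius=0.1, thresh=0.9):
    pa, pb = a.buffer(radius), b.buffer(radius)
    inter = pa.intersection(pb).area
    union = pa.union(pb).area
    return union > 0 and inter / union >= thresh

# e.g. the same divider reported twice by the left and right lane:
# line = LineString([(0, 0), (10, 0)])
# _lines_look_duplicated(line, LineString(line.coords))  # identical geometry -> True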
Args: lines (List): list of dividers Returns: lines (List): list of left dividers ''' new_lane_dict = {} # for line in lines: for key, value in lane_dict.items(): repeated = False # for l in new_lines: for new_key, new_value in new_lane_dict.items(): # length = min(line.length, l.length) line = LineString(value['polyline'].xyz) l = LineString(new_value['polyline'].xyz) area1 = line.buffer(0.01) area2 = l.buffer(0.01) inter = area1.intersection(area2).area union = area1.union(area2).area iou = inter / union if iou >= 0.90: repeated = True break if not repeated: new_lane_dict[key] = value return new_lane_dict def reassign_graph_attribute(lane_dict): for key, value in lane_dict.items(): if len(value['predecessors']) > 0: if value['predecessors'][0] not in lane_dict.keys() or value['predecessors'][0]==key: value['predecessors'] = [] else: lane_dict[value['predecessors'][0]]['successors'] = [key] for key, value in lane_dict.items(): if len(value['successors']) > 0: if value['successors'][0] not in lane_dict.keys() or value['successors'][0]==key: value['successors'] = [] else: lane_dict[value['successors'][0]]['predecessors'] = [key] return lane_dict def remove_boundary_dividers(dividers: List[LineString], boundaries: List[LineString]) -> List[LineString]: ''' Some dividers overlaps with boundaries in argoverse2 dataset so we need to remove these dividers. Args: dividers (list): list of dividers boundaries (list): list of boundaries Returns: left_dividers (list): list of left dividers ''' for idx in range(len(dividers))[::-1]: divider = dividers[idx] for bound in boundaries: length = min(divider.length, bound.length) # hand-crafted rule to check overlap if divider.buffer(0.3).intersection(bound.buffer(0.3)).area \ > 0.2 * length: # the divider overlaps boundary dividers.pop(idx) break return dividers def connect_lines(lines: List[LineString]) -> List[LineString]: ''' Some dividers are split into multiple small parts so we need to connect these lines. Args: dividers (list): list of dividers boundaries (list): list of boundaries Returns: left_dividers (list): list of left dividers ''' new_lines = [] eps = 0.1 # threshold to identify continuous lines while len(lines) > 1: line1 = lines[0] merged_flag = False for i, line2 in enumerate(lines[1:]): # hand-crafted rule begin1 = list(line1.coords)[0] end1 = list(line1.coords)[-1] begin2 = list(line2.coords)[0] end2 = list(line2.coords)[-1] dist_matrix = distance.cdist([begin1, end1], [begin2, end2]) if dist_matrix[0, 0] < eps: coords = list(line2.coords)[::-1] + list(line1.coords) elif dist_matrix[0, 1] < eps: coords = list(line2.coords) + list(line1.coords) elif dist_matrix[1, 0] < eps: coords = list(line1.coords) + list(line2.coords) elif dist_matrix[1, 1] < eps: coords = list(line1.coords) + list(line2.coords)[::-1] else: continue new_line = LineString(coords) lines.pop(i + 1) lines[0] = new_line merged_flag = True break if merged_flag: continue new_lines.append(line1) lines.pop(0) if len(lines) == 1: new_lines.append(lines[0]) return new_lines def transform_from(xyz: NDArray, translation: NDArray, rotation: NDArray) -> NDArray: ''' Transform points between different coordinate system. 
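# --- Illustrative aside (hypothetical values, not part of this file): the
# `transform_from` helper defined here treats points as row vectors, so a
# rotation matrix R and translation t are applied as `xyz @ R.T + t`.
import numpy as np

R = np.array([[0., -1., 0.],
              [1.,  0., 0.],
              [0.,  0., 1.]])            # 90-degree rotation about the z-axis
t = np.array([1.0, 2.0, 0.0])
xyz = np.array([[1.0, 0.0, 0.0]])
new_xyz = xyz @ R.T + t                  # -> [[1., 3., 0.]]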
Args: xyz (array): original point coordinates translation (array): translation rotation (array): rotation matrix Returns: left_dividers (list): list of left dividers ''' new_xyz = xyz @ rotation.T + translation return new_xyz ================================================ FILE: plugin/datasets/nusc_dataset.py ================================================ from.base_dataset import BaseMapDataset from .map_utils.nuscmap_extractor import NuscMapExtractor from mmdet.datasets import DATASETS import numpy as np from .visualize.renderer import Renderer import mmcv from time import time from pyquaternion import Quaternion import pickle @DATASETS.register_module() class NuscDataset(BaseMapDataset): """NuScenes map dataset class. Args: ann_file (str): annotation file path cat2id (dict): category to class id roi_size (tuple): bev range eval_config (Config): evaluation config meta (dict): meta information pipeline (Config): data processing pipeline config interval (int): annotation load interval work_dir (str): path to work dir test_mode (bool): whether in test mode """ def __init__(self, data_root, **kwargs): super().__init__(**kwargs) self.map_extractor = NuscMapExtractor(data_root, self.roi_size) self.renderer = Renderer(self.cat2id, self.roi_size, 'nusc') def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations. """ start_time = time() ann = mmcv.load(ann_file) samples = ann[::self.interval] print(f'collected {len(samples)} samples in {(time() - start_time):.2f}s') self.samples = samples def load_matching(self, matching_file): with open(matching_file, 'rb') as pf: data = pickle.load(pf) total_samples = 0 for scene_name, info in data.items(): total_samples += len(info['sample_ids']) assert total_samples == len(self.samples), 'Matching info not matched with data samples' self.matching_meta = data print(f'loaded matching meta for {len(data)} scenes') def get_sample(self, idx): """Get data sample. For each sample, map extractor will be applied to extract map elements. 
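# --- Illustrative aside (made-up pose values, not part of this file): the
# transform chaining used in `get_sample` below packs each quaternion +
# translation pair into a 4x4 homogeneous matrix and composes them, so that
# lidar2global = ego2global @ lidar2ego.
import numpy as np
from pyquaternion import Quaternion

def _to_homogeneous(rotation_wxyz, translation):
    mat = np.eye(4)
    mat[:3, :3] = Quaternion(rotation_wxyz).rotation_matrix
    mat[:3, 3] = translation
    return mat

lidar2ego = _to_homogeneous([1.0, 0.0, 0.0, 0.0], [0.94, 0.0, 1.84])          # hypothetical calibration
ego2global = _to_homogeneous([0.7071068, 0.0, 0.0, 0.7071068], [600.0, 1600.0, 0.0])
lidar2global = ego2global @ lidar2ego
lidar2global_translation = lidar2global[:3, 3]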
Args: idx (int): data index Returns: result (dict): dict of input """ sample = self.samples[idx] location = sample['location'] lidar2ego = np.eye(4) lidar2ego[:3,:3] = Quaternion(sample['lidar2ego_rotation']).rotation_matrix lidar2ego[:3, 3] = sample['lidar2ego_translation'] ego2global = np.eye(4) ego2global[:3,:3] = Quaternion(sample['e2g_rotation']).rotation_matrix ego2global[:3, 3] = sample['e2g_translation'] # NOTE: The original StreamMapNet uses the ego location to query the map, # to align with the lidar-centered setting in MapTR, we made some modifiactions # here to switch to the lidar-center setting lidar2global = ego2global @ lidar2ego lidar2global_translation = list(lidar2global[:3, 3]) lidar2global_translation = [float(x) for x in lidar2global_translation] lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) map_geoms = self.map_extractor.get_map_geom(location, lidar2global_translation, lidar2global_rotation) lidar_shifted_e2g_translation = np.array(sample['e2g_translation']) lidar_shifted_e2g_translation[0] = lidar2global_translation[0] lidar_shifted_e2g_translation[1] = lidar2global_translation[1] lidar_shifted_e2g_translation = lidar_shifted_e2g_translation.tolist() e2g_rotation = sample['e2g_rotation'] lidar2global = np.eye(4) lidar2global[:3,:3] = Quaternion(e2g_rotation).rotation_matrix lidar2global[:3, 3] = lidar_shifted_e2g_translation global2lidar = np.linalg.inv(lidar2global) ego2lidar = global2lidar @ ego2global map_label2geom = {} for k, v in map_geoms.items(): if k in self.cat2id.keys(): map_label2geom[self.cat2id[k]] = v ego2img_rts = [] ego2cam_rts = [] for c in sample['cams'].values(): extrinsic, intrinsic = np.array( c['extrinsics']), np.array(c['intrinsics']) # ego coord to cam coord #ego2cam_rt = extrinsic cam2ego_rt = np.linalg.inv(extrinsic) cam2lidar_rt = ego2lidar @ cam2ego_rt lidar2cam_rt = np.linalg.inv(cam2lidar_rt) ego2cam_rt = lidar2cam_rt viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic ego2img_rt = (viewpad @ ego2cam_rt) ego2cam_rts.append(ego2cam_rt) ego2img_rts.append(ego2img_rt) input_dict = { 'location': location, 'token': sample['token'], 'img_filenames': [c['img_fpath'] for c in sample['cams'].values()], # intrinsics are 3x3 Ks 'cam_intrinsics': [c['intrinsics'] for c in sample['cams'].values()], # extrinsics are 4x4 tranform matrix, **ego2cam** 'cam_extrinsics': [c['extrinsics'] for c in sample['cams'].values()], 'ego2img': ego2img_rts, 'ego2cam': ego2cam_rts, 'map_geoms': map_label2geom, # {0: List[ped_crossing(LineString)], 1: ...} #'ego2global_translation': sample['e2g_translation'], #'ego2global_rotation': Quaternion(sample['e2g_rotation']).rotation_matrix.tolist(), 'ego2global_translation': lidar_shifted_e2g_translation, 'ego2global_rotation': Quaternion(e2g_rotation).rotation_matrix.tolist(), 'sample_idx': sample['sample_idx'], 'scene_name': sample['scene_name'], 'lidar2ego_translation': sample['lidar2ego_translation'], 'lidar2ego_rotation': sample['lidar2ego_rotation'], } return input_dict ================================================ FILE: plugin/datasets/pipelines/__init__.py ================================================ from .loading import LoadMultiViewImagesFromFiles from .formating import FormatBundleMap from .transform import ResizeMultiViewImages, PadMultiViewImages, Normalize3D, PhotoMetricDistortionMultiViewImage from .rasterize import RasterizeMap, PV_Map from .vectorize import VectorizeMap __all__ = [ 'LoadMultiViewImagesFromFiles', 'FormatBundleMap', 'Normalize3D', 
'ResizeMultiViewImages', 'PadMultiViewImages', 'RasterizeMap', 'PV_Map', 'VectorizeMap', 'PhotoMetricDistortionMultiViewImage' ] ================================================ FILE: plugin/datasets/pipelines/formating.py ================================================ import numpy as np from mmcv.parallel import DataContainer as DC from mmdet3d.core.points import BasePoints from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import to_tensor @PIPELINES.register_module() class FormatBundleMap(object): """Format data for map tasks and then collect data for model input. These fields are formatted as follows. - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True) - semantic_mask (if exists): (1) to tensor, (2) to DataContainer (stack=True) - vectors (if exists): (1) to DataContainer (cpu_only=True) - img_metas: (1) to DataContainer (cpu_only=True) """ def __init__(self, process_img=True, keys=['img', 'semantic_mask', 'vectors'], meta_keys=['intrinsics', 'extrinsics']): self.process_img = process_img self.keys = keys self.meta_keys = meta_keys def __call__(self, results): """Call function to transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ # Format 3D data if 'points' in results: assert isinstance(results['points'], BasePoints) results['points'] = DC(results['points'].tensor) for key in ['voxels', 'coors', 'voxel_centers', 'num_points']: if key not in results: continue results[key] = DC(to_tensor(results[key]), stack=False) if 'img' in results and self.process_img: if isinstance(results['img'], list): # process multiple imgs in single frame imgs = [img.transpose(2, 0, 1) for img in results['img']] imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) results['img'] = DC(to_tensor(imgs), stack=True) else: img = np.ascontiguousarray(results['img'].transpose(2, 0, 1)) results['img'] = DC(to_tensor(img), stack=True) if 'semantic_mask' in results: #results['semantic_mask'] = DC(to_tensor(results['semantic_mask']), stack=True) if isinstance(results['semantic_mask'], np.ndarray): results['semantic_mask'] = DC(to_tensor(results['semantic_mask']), stack=True, pad_dims=None) else: assert isinstance(results['semantic_mask'], list) results['semantic_mask'] = DC(results['semantic_mask'], stack=False) if 'vectors' in results: # vectors may have different sizes vectors = results['vectors'] results['vectors'] = DC(vectors, stack=False, cpu_only=True) if 'polys' in results: results['polys'] = DC(results['polys'], stack=False, cpu_only=True) return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(process_img={self.process_img}, ' return repr_str ================================================ FILE: plugin/datasets/pipelines/loading.py ================================================ import mmcv import numpy as np from mmdet.datasets.builder import PIPELINES @PIPELINES.register_module(force=True) class LoadMultiViewImagesFromFiles(object): """Load multi channel images from a list of separate channel files. Expects results['img_filename'] to be a list of filenames. Args: to_float32 (bool): Whether to convert the img to float32. Defaults to False. color_type (str): Color type of the file. Defaults to 'unchanged'. 
""" def __init__(self, to_float32=False, color_type='unchanged'): self.to_float32 = to_float32 self.color_type = color_type def __call__(self, results): """Call function to load multi-view image from files. Args: results (dict): Result dict containing multi-view image filenames. Returns: dict: The result dict containing the multi-view image data. \ Added keys and values are described below. - filename (str): Multi-view image filenames. - img (np.ndarray): Multi-view image arrays. - img_shape (tuple[int]): Shape of multi-view image arrays. - ori_shape (tuple[int]): Shape of original image arrays. - pad_shape (tuple[int]): Shape of padded image arrays. - scale_factor (float): Scale factor. - img_norm_cfg (dict): Normalization configuration of images. """ filename = results['img_filenames'] img = [mmcv.imread(name, self.color_type) for name in filename] if self.to_float32: img = [i.astype(np.float32) for i in img] results['img'] = img results['img_shape'] = [i.shape for i in img] results['ori_shape'] = [i.shape for i in img] # Set initial values for default meta_keys results['pad_shape'] = [i.shape for i in img] # results['scale_factor'] = 1.0 num_channels = 1 if len(img[0].shape) < 3 else img[0].shape[2] results['img_norm_cfg'] = dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False) results['img_fields'] = ['img'] return results def __repr__(self): """str: Return a string that describes the module.""" return f'{self.__class__.__name__} (to_float32={self.to_float32}, '\ f"color_type='{self.color_type}')" ================================================ FILE: plugin/datasets/pipelines/rasterize.py ================================================ import numpy as np from mmdet.datasets.builder import PIPELINES from shapely.geometry import LineString, Polygon from shapely import affinity import cv2 from PIL import Image, ImageDraw from numpy.typing import NDArray from typing import List, Tuple, Union, Dict import torch import pdb @PIPELINES.register_module(force=True) class RasterizeMap(object): """Generate rasterized semantic map and put into `semantic_mask` key. Args: roi_size (tuple or list): bev range canvas_size (tuple or list): bev feature size thickness (int): thickness of rasterized lines coords_dim (int): dimension of point coordinates """ def __init__(self, roi_size: Union[Tuple, List], canvas_size: Union[Tuple, List], thickness: int, coords_dim: int, semantic_mask=False, ): self.roi_size = roi_size self.canvas_size = canvas_size self.scale_x = self.canvas_size[0] / self.roi_size[0] self.scale_y = self.canvas_size[1] / self.roi_size[1] self.thickness = thickness self.coords_dim = coords_dim self.semantic_mask = semantic_mask def line_ego_to_mask(self, line_ego: LineString, mask: NDArray, color: int=1, thickness: int=3, fill_poly=False ) -> None: # """Rasterize a single line to mask. 
# Args: # line_ego (LineString): line # mask (array): semantic mask to paint on # color (int): positive label, default: 1 # thickness (int): thickness of rasterized lines, default: 3 # """ trans_x = self.canvas_size[0] / 2 trans_y = self.canvas_size[1] / 2 line_ego = affinity.scale(line_ego, self.scale_x, self.scale_y, origin=(0, 0)) line_ego = affinity.affine_transform(line_ego, [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) coords = np.array(list(line_ego.coords), dtype=np.int32)[:, :2] coords = coords.reshape((-1, 2)) assert len(coords) >= 2 if fill_poly: cv2.fillPoly(mask, np.int32([coords]), color=color) else: cv2.polylines(mask, np.int32([coords]), False, color=color, thickness=thickness) def polygons_ego_to_mask(self, polygons: List[Polygon], color: int=1) -> NDArray: # ''' Rasterize a polygon to mask. # Args: # polygons (list): list of polygons # color (int): positive label, default: 1 # Returns: # mask (array): mask with rasterize polygons # ''' #mask = Image.new("L", size=(self.canvas_size[0], self.canvas_size[1]), color=0) # Image lib api expect size as (w, h) trans_x = self.canvas_size[0] / 2 trans_y = self.canvas_size[1] / 2 masks = [] for polygon in polygons: mask = Image.new("L", size=(self.canvas_size[0], self.canvas_size[1]), color=0) polygon = affinity.scale(polygon, self.scale_x, self.scale_y, origin=(0, 0)) polygon = affinity.affine_transform(polygon, [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) ext = np.array(polygon.exterior.coords)[:, :2] vert_list = [(x, y) for x, y in ext] ImageDraw.Draw(mask).polygon(vert_list, outline=1, fill=color) masks.append(mask) #return np.array(mask, np.uint8) return masks def get_semantic_mask(self, map_geoms: Dict) -> NDArray: # ''' Rasterize all map geometries to semantic mask. # Args: # map_geoms (dict): map geoms by class # Returns: # semantic_mask (array): semantic mask # ''' num_classes = len(map_geoms) if self.semantic_mask: semantic_mask = np.zeros((num_classes, self.canvas_size[1], self.canvas_size[0]), dtype=np.uint8) else: instance_masks = [] for label, geom_list in map_geoms.items(): if len(geom_list) == 0: continue if geom_list[0].geom_type == 'LineString': for line in geom_list: if self.semantic_mask: fill_poly = True if label == 0 else False self.line_ego_to_mask(line, semantic_mask[label], color=1, thickness=self.thickness, fill_poly=fill_poly) else: canvas = np.zeros((self.canvas_size[1], self.canvas_size[0]), dtype=np.uint8) self.line_ego_to_mask(line, canvas, color=1, thickness=self.thickness, fill_poly=False) instance_masks.append([canvas, label]) elif geom_list[0].geom_type == 'Polygon': # drivable area polygons = [] for polygon in geom_list: polygons.append(polygon) if self.semantic_mask: semantic_mask[label] = self.polygons_ego_to_mask(polygons, color=1) else: polygon_masks = self.polygons_ego_to_mask(polygons, color=1) for mask in polygon_masks: instance_masks.append([mask, label]) else: raise ValueError('map geoms must be either LineString or Polygon!') if self.semantic_mask: semantic_mask = np.ascontiguousarray(semantic_mask) return semantic_mask else: return instance_masks def __call__(self, input_dict: Dict) -> Dict: map_geoms = input_dict['map_geoms'] # {0: List[ped_crossing: LineString], 1: ...} semantic_mask = self.get_semantic_mask(map_geoms) input_dict['semantic_mask'] = semantic_mask # (num_class, canvas_size[1], canvas_size[0]) return input_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(roi_size={self.roi_size}, ' repr_str += f'canvas_size={self.canvas_size}), ' repr_str += 
f'thickness={self.thickness}), ' repr_str += f'coords_dim={self.coords_dim})' return repr_str @PIPELINES.register_module(force=True) class PV_Map(object): """Generate rasterized semantic map and put into `semantic_mask` key. Args: roi_size (tuple or list): bev range canvas_size (tuple or list): bev feature size thickness (int): thickness of rasterized lines coords_dim (int): dimension of point coordinates """ def __init__(self, img_shape: Union[Tuple, List], feat_down_sample: int, thickness: int, coords_dim: int, pv_mask=False, num_cams=6, num_coords=2 ): self.num_cams = num_cams self.num_coords = num_coords self.img_shape = img_shape self.feat_down_sample = feat_down_sample self.pv_scale_x = self.img_shape[0] // feat_down_sample self.pv_scale_y = self.img_shape[1] // feat_down_sample self.thickness = thickness self.coords_dim = coords_dim self.pv_mask = pv_mask def perspective(self,cam_coords, proj_mat): pix_coords = proj_mat @ cam_coords valid_idx = pix_coords[2, :] > 0 pix_coords = pix_coords[:, valid_idx] pix_coords = pix_coords[:2, :] / (pix_coords[2, :] + 1e-7) pix_coords = pix_coords.transpose(1, 0) return pix_coords @staticmethod def get_valid_pix_coords(pix_coords): valid_idx = pix_coords[:, 2] > 0 pix_coords = pix_coords[valid_idx, :] pix_coords = pix_coords[:, :2] / (pix_coords[:, 2:3] + 1e-7) return pix_coords def line_ego_to_pvmask(self, line_ego, mask, lidar2feat, color=1, thickness=1): distances = np.linspace(0, line_ego.length, 200) coords = np.array([np.array(line_ego.interpolate(distance).coords) for distance in distances]).reshape(-1, self.num_coords) if coords.shape[1] == 2: coords = np.concatenate((coords,np.zeros((coords.shape[0],1))),axis=1) pts_num = coords.shape[0] ones = np.ones((pts_num,1)) lidar_coords = np.concatenate([coords,ones], axis=1).transpose(1,0) pix_coords = self.perspective(lidar_coords, lidar2feat) // self.feat_down_sample cv2.polylines(mask, np.int32([pix_coords]), False, color=color, thickness=thickness) def lines_ego_to_pv(self, lines_ego, pv_mask, ego2imgs, color=1, thickness=1): lines_coord = [] for line_ego in lines_ego: distances = np.linspace(0, line_ego.length, 100) coords = np.array([np.array(line_ego.interpolate(distance).coords) for distance in distances]).reshape(-1, self.num_coords) if coords.shape[1] == 2: coords = np.concatenate((coords,np.zeros((coords.shape[0],1))),axis=1) pts_num = coords.shape[0] ones = np.ones((pts_num,1)) lidar_coords = np.concatenate([coords,ones], axis=1) lines_coord.append(lidar_coords) lines_coord = torch.tensor(np.stack(lines_coord, axis=0)) for cam_idx in range(len(ego2imgs)): ego2img_i = torch.tensor(ego2imgs[cam_idx]) pers_lines_coord = torch.einsum('lk,ijk->ijl', ego2img_i, lines_coord) valid_lines_coord = [self.get_valid_pix_coords(pers_coord) for pers_coord in pers_lines_coord] valid_lines_coord = [x // self.feat_down_sample for x in valid_lines_coord if len(x) > 0] lines_to_draw = [x.numpy().astype(np.int32) for x in valid_lines_coord] cv2.polylines(pv_mask[cam_idx], lines_to_draw, False, color=color, thickness=thickness) def get_pvmask_old(self,map_geoms: Dict,ego2img: List, img_filenames: List) -> NDArray: # ''' Rasterize all map geometries to semantic mask. 
# Args: # map_geoms (dict): map geoms by class # Returns: # semantic_mask (array): semantic mask # ''' num_classes = len(map_geoms) if self.pv_mask: gt_pv_mask = np.zeros((self.num_cams, num_classes, self.pv_scale_x, self.pv_scale_y), dtype=np.uint8) else: instance_masks = [] for label, geom_list in map_geoms.items(): if len(geom_list) == 0: continue if geom_list[0].geom_type == 'LineString': for line in geom_list: for cam_index in range(self.num_cams): self.line_ego_to_pvmask(line,gt_pv_mask[cam_index][label],ego2img[cam_index],color=1,thickness=self.thickness) if self.pv_mask: gt_pv_mask = np.ascontiguousarray(gt_pv_mask) ## Visualize to double-check the pv seg is correct #self.visualize_all_pv_masks(gt_pv_mask, img_filenames) #import pdb; pdb.set_trace() return gt_pv_mask else: return instance_masks def get_pvmask(self, map_geoms: Dict,ego2img: List, img_filenames: List) -> NDArray: # ''' Rasterize all map geometries to semantic mask. # Args: # map_geoms (dict): map geoms by class # Returns: # semantic_mask (array): semantic mask # ''' num_classes = len(map_geoms) if self.pv_mask: gt_pv_mask = np.zeros((num_classes, self.num_cams, self.pv_scale_x, self.pv_scale_y), dtype=np.uint8) else: instance_masks = [] for label, geom_list in map_geoms.items(): if len(geom_list) == 0: continue self.lines_ego_to_pv(geom_list, gt_pv_mask[label], ego2img, color=1, thickness=self.thickness) gt_pv_mask = gt_pv_mask.transpose(1, 0, 2, 3) if self.pv_mask: gt_pv_mask = np.ascontiguousarray(gt_pv_mask) ## Visualize to double-check the pv seg is correct #self.visualize_all_pv_masks(gt_pv_mask, img_filenames) #import pdb; pdb.set_trace() return gt_pv_mask else: return instance_masks def __call__(self, input_dict: Dict) -> Dict: map_geoms = input_dict['map_geoms'] # {0: List[ped_crossing: LineString], 1: ...} pv_mask = self.get_pvmask(map_geoms, input_dict['ego2img'], input_dict['img_filenames']) input_dict['pv_mask'] = pv_mask # (num_class, canvas_size[1], canvas_size[0]) return input_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(roi_size={self.roi_size}, ' repr_str += f'canvas_size={self.canvas_size}), ' repr_str += f'thickness={self.thickness}), ' repr_str += f'coords_dim={self.coords_dim})' return repr_str def visualize_all_pv_masks(self, gt_pv_mask, img_filenames): for cam_id in range(gt_pv_mask.shape[0]): viz_img = self._visualize_pv_mask(gt_pv_mask[cam_id]) viz_img = viz_img.transpose(1, 2, 0) out_path = './check_pv_seg/viz_{}.png'.format(cam_id) out_raw_path = './check_pv_seg/viz_raw_{}.png'.format(cam_id) filepath = img_filenames[cam_id] pv_img = cv2.imread(filepath) #pv_img = cv2.resize(pv_img, (800, 480)) #viz_mask = cv2.resize(viz_img, (800, 480)) pv_img = cv2.resize(pv_img, (608, 608)) viz_mask = cv2.resize(viz_img, (608, 608)) mask = (viz_mask == 255).all(-1)[..., None] viz_img = pv_img * mask + viz_mask * (1-mask) cv2.imwrite(out_path, viz_img) cv2.imwrite(out_raw_path, pv_img) def _visualize_pv_mask(self, pv_mask): COLOR_MAPS_BGR = { # bgr colors 1: (0, 0, 255), 2: (0, 255, 0), 0: (255, 0, 0), } num_classes, h, w = pv_mask.shape viz_img = np.ones((num_classes, h, w), dtype=np.uint8) * 255 for label in range(num_classes): valid = (pv_mask[label] == 1) viz_img[:, valid] = np.array(COLOR_MAPS_BGR[label]).reshape(3, 1) return viz_img ================================================ FILE: plugin/datasets/pipelines/transform.py ================================================ import numpy as np import mmcv from mmdet.datasets.builder import PIPELINES from numpy import 
random @PIPELINES.register_module(force=True) class Normalize3D(object): """Normalize the image. Added key is "img_norm_cfg". Args: mean (sequence): Mean values of 3 channels. std (sequence): Std values of 3 channels. to_rgb (bool): Whether to convert the image from BGR to RGB, default is true. """ def __init__(self, mean, std, to_rgb=True): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb def __call__(self, results): """Call function to normalize images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Normalized results, 'img_norm_cfg' key is added into result dict. """ for key in results.get('img_fields', ['img']): results[key] = [mmcv.imnormalize( img, self.mean, self.std, self.to_rgb) for img in results[key]] results['img_norm_cfg'] = dict( mean=self.mean, std=self.std, to_rgb=self.to_rgb) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' return repr_str @PIPELINES.register_module(force=True) class PadMultiViewImages(object): """Pad multi-view images and change intrinsics There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", If set `change_intrinsics=True`, key 'cam_intrinsics' and 'ego2img' will be changed. Args: size (tuple, optional): Fixed padding size, (h, w). size_divisor (int, optional): The divisor of padded size. pad_val (float, optional): Padding value, 0 by default. change_intrinsics (bool): whether to update intrinsics. """ def __init__(self, size=None, size_divisor=None, pad_val=0, change_intrinsics=False): self.size = size self.size_divisor = size_divisor self.pad_val = pad_val # only one of size and size_divisor should be valid assert size is not None or size_divisor is not None assert size is None or size_divisor is None self.change_intrinsics = change_intrinsics def _pad_img(self, results): """Pad images according to ``self.size``.""" original_shape = [img.shape for img in results['img']] for key in results.get('img_fields', ['img']): if self.size is not None: padded_img = [mmcv.impad( img, shape=self.size, pad_val=self.pad_val) for img in results[key]] elif self.size_divisor is not None: padded_img = [mmcv.impad_to_multiple( img, self.size_divisor, pad_val=self.pad_val) for img in results[key]] results[key] = padded_img if self.change_intrinsics: post_intrinsics, post_ego2imgs = [], [] for img, oshape, cam_intrinsic, ego2img in zip(results['img'], \ original_shape, results['cam_intrinsics'], results['ego2img']): scaleW = img.shape[1] / oshape[1] scaleH = img.shape[0] / oshape[0] rot_resize_matrix = np.array([ [scaleW, 0, 0, 0], [0, scaleH, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]) post_intrinsic = rot_resize_matrix[:3, :3] @ cam_intrinsic post_ego2img = rot_resize_matrix @ ego2img post_intrinsics.append(post_intrinsic) post_ego2imgs.append(post_ego2img) results.update({ 'cam_intrinsics': post_intrinsics, 'ego2img': post_ego2imgs, }) results['img_shape'] = [img.shape for img in padded_img] results['img_fixed_size'] = self.size results['img_size_divisor'] = self.size_divisor def __call__(self, results): """Call function to pad images, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Updated result dict. 
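# --- Illustrative aside (hypothetical numbers, not part of this file): why
# `change_intrinsics` above rescales the camera matrix. If padding/resizing
# scales image widths by sW and heights by sH, the pinhole intrinsics K must
# be scaled the same way for projection to remain consistent.
import numpy as np

K = np.array([[1266.4, 0.0, 816.3],
              [0.0, 1266.4, 491.5],
              [0.0, 0.0, 1.0]])          # nuScenes-like intrinsics (illustrative)
sW, sH = 0.5, 0.5                        # e.g. 1600x900 -> 800x450
K_rescaled = np.diag([sW, sH, 1.0]) @ K  # focal lengths and principal point halve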
""" self._pad_img(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.size}, ' repr_str += f'size_divisor={self.size_divisor}, ' repr_str += f'pad_val={self.pad_val})' repr_str += f'change_intrinsics={self.change_intrinsics})' return repr_str @PIPELINES.register_module(force=True) class ResizeMultiViewImages(object): """Resize mulit-view images and change intrinsics If set `change_intrinsics=True`, key 'cam_intrinsics' and 'ego2img' will be changed Args: size (tuple, optional): resize target size, (h, w). change_intrinsics (bool): whether to update intrinsics. """ def __init__(self, size=None, scale=None, change_intrinsics=True): self.size = size self.scale = scale assert size is None or scale is None self.change_intrinsics = change_intrinsics def __call__(self, results:dict): new_imgs, post_intrinsics, post_ego2imgs = [], [], [] for img, cam_intrinsic, ego2img in zip(results['img'], \ results['cam_intrinsics'], results['ego2img']): if self.scale is not None: h, w = img.shape[:2] target_h = int(h * self.scale) target_w = int(w * self.scale) else: target_h = self.size[0] target_w = self.size[1] tmp, scaleW, scaleH = mmcv.imresize(img, # NOTE: mmcv.imresize expect (w, h) shape (target_w, target_h), return_scale=True) new_imgs.append(tmp) rot_resize_matrix = np.array([ [scaleW, 0, 0, 0], [0, scaleH, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]) post_intrinsic = rot_resize_matrix[:3, :3] @ cam_intrinsic post_ego2img = rot_resize_matrix @ ego2img post_intrinsics.append(post_intrinsic) post_ego2imgs.append(post_ego2img) results['img'] = new_imgs results['img_shape'] = [img.shape for img in new_imgs] if self.change_intrinsics: results.update({ 'cam_intrinsics': post_intrinsics, 'ego2img': post_ego2imgs, }) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.size}, ' repr_str += f'change_intrinsics={self.change_intrinsics})' return repr_str @PIPELINES.register_module() class PhotoMetricDistortionMultiViewImage: """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) 8. randomly swap channels Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. """ def __init__(self, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def __call__(self, results): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. 
""" imgs = results['img'] new_imgs = [] for img in imgs: assert img.dtype == np.float32, \ 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ ' please set "to_float32=True" in "LoadImageFromFile" pipeline' # random brightness if random.randint(2): delta = random.uniform(-self.brightness_delta, self.brightness_delta) img += delta # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last mode = random.randint(2) if mode == 1: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # convert color from BGR to HSV img = mmcv.bgr2hsv(img) # random saturation if random.randint(2): img[..., 1] *= random.uniform(self.saturation_lower, self.saturation_upper) # random hue if random.randint(2): img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = mmcv.hsv2bgr(img) # random contrast if mode == 0: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # randomly swap channels # if random.randint(2): # img = img[..., random.permutation(3)] new_imgs.append(img) results['img'] = new_imgs return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' repr_str += 'contrast_range=' repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' repr_str += 'saturation_range=' repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' repr_str += f'hue_delta={self.hue_delta})' return repr_str ================================================ FILE: plugin/datasets/pipelines/vectorize.py ================================================ import numpy as np from mmdet.datasets.builder import PIPELINES from shapely.geometry import LineString from numpy.typing import NDArray from typing import List, Tuple, Union, Dict @PIPELINES.register_module(force=True) class VectorizeMap(object): """Generate vectoized map and put into `semantic_mask` key. Concretely, shapely geometry objects are converted into sample points (ndarray). We use args `sample_num`, `sample_dist`, `simplify` to specify sampling method. Args: roi_size (tuple or list): bev range . normalize (bool): whether to normalize points to range (0, 1). coords_dim (int): dimension of point coordinates. simplify (bool): whether to use simpily function. If true, `sample_num` \ and `sample_dist` will be ignored. sample_num (int): number of points to interpolate from a polyline. Set to -1 to ignore. sample_dist (float): interpolate distance. Set to -1 to ignore. """ def __init__(self, roi_size: Union[Tuple, List], normalize: bool, coords_dim: int, simplify: bool=False, sample_num: int=-1, sample_dist: float=-1, permute: bool=False ): self.coords_dim = coords_dim self.sample_num = sample_num self.sample_dist = sample_dist self.roi_size = np.array(roi_size) self.normalize = normalize self.simplify = simplify self.permute = permute if sample_dist > 0: assert sample_num < 0 and not simplify self.sample_fn = self.interp_fixed_dist elif sample_num > 0: assert sample_dist < 0 and not simplify self.sample_fn = self.interp_fixed_num else: assert simplify def interp_fixed_num(self, line: LineString) -> NDArray: ''' Interpolate a line to fixed number of points. 
Args: line (LineString): line Returns: points (array): interpolated points, shape (N, 2) ''' distances = np.linspace(0, line.length, self.sample_num) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def interp_fixed_dist(self, line: LineString) -> NDArray: ''' Interpolate a line at fixed interval. Args: line (LineString): line Returns: points (array): interpolated points, shape (N, 2) ''' distances = list(np.arange(self.sample_dist, line.length, self.sample_dist)) # make sure to sample at least two points when sample_dist > line.length distances = [0,] + distances + [line.length,] sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def get_vectorized_lines(self, map_geoms: Dict) -> Dict: ''' Vectorize map elements. Iterate over the input dict and apply the specified sample funcion. Args: line (LineString): line Returns: vectors (array): dict of vectorized map elements. ''' vectors = {} for label, geom_list in map_geoms.items(): vectors[label] = [] for geom in geom_list: if geom.geom_type == 'LineString': if self.simplify: line = geom.simplify(0.2, preserve_topology=True) line = np.array(line.coords) else: line = self.sample_fn(geom) line = line[:, :self.coords_dim] if self.normalize: line = self.normalize_line(line) if self.permute: line = self.permute_line(line) vectors[label].append(line) elif geom.geom_type == 'Polygon': # polygon objects will not be vectorized continue else: raise ValueError('map geoms must be either LineString or Polygon!') return vectors def normalize_line(self, line: NDArray) -> NDArray: ''' Convert points to range (0, 1). Args: line (LineString): line Returns: normalized (array): normalized points. 
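# --- Illustrative aside (not part of the pipeline): the fixed-number sampling
# used by `interp_fixed_num` above resamples a polyline into `sample_num`
# points evenly spaced along its arc length via shapely's interpolate().
import numpy as np
from shapely.geometry import LineString

line = LineString([(0, 0), (4, 0), (4, 3)])    # total length 7
distances = np.linspace(0, line.length, 8)
points = np.array([list(line.interpolate(d).coords)[0] for d in distances])
# points.shape == (8, 2); first point is (0, 0), last point is (4, 3)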
''' origin = -np.array([self.roi_size[0]/2, self.roi_size[1]/2]) line[:, :2] = line[:, :2] - origin # transform from range [0, 1] to (0, 1) eps = 1e-5 line[:, :2] = line[:, :2] / (self.roi_size + eps) return line def permute_line(self, line: np.ndarray, padding=1e5): ''' (num_pts, 2) -> (num_permute, num_pts, 2) where num_permute = 2 * (num_pts - 1) ''' is_closed = np.allclose(line[0], line[-1], atol=1e-3) num_points = len(line) permute_num = num_points - 1 permute_lines_list = [] if is_closed: pts_to_permute = line[:-1, :] # throw away replicate start end pts for shift_i in range(permute_num): permute_lines_list.append(np.roll(pts_to_permute, shift_i, axis=0)) flip_pts_to_permute = np.flip(pts_to_permute, axis=0) for shift_i in range(permute_num): permute_lines_list.append(np.roll(flip_pts_to_permute, shift_i, axis=0)) else: permute_lines_list.append(line) permute_lines_list.append(np.flip(line, axis=0)) permute_lines_array = np.stack(permute_lines_list, axis=0) if is_closed: tmp = np.zeros((permute_num * 2, num_points, self.coords_dim)) tmp[:, :-1, :] = permute_lines_array tmp[:, -1, :] = permute_lines_array[:, 0, :] # add replicate start end pts permute_lines_array = tmp else: # padding padding = np.full([permute_num * 2 - 2, num_points, self.coords_dim], padding) permute_lines_array = np.concatenate((permute_lines_array, padding), axis=0) return permute_lines_array def __call__(self, input_dict): map_geoms = input_dict['map_geoms'] input_dict['vectors'] = self.get_vectorized_lines(map_geoms) return input_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(simplify={self.simplify}, ' repr_str += f'sample_num={self.sample_num}), ' repr_str += f'sample_dist={self.sample_dist}), ' repr_str += f'roi_size={self.roi_size})' repr_str += f'normalize={self.normalize})' repr_str += f'coords_dim={self.coords_dim})' return repr_str ================================================ FILE: plugin/datasets/samplers/__init__.py ================================================ from .group_sampler import DistributedGroupSampler, InfiniteGroupEachSampleInBatchSampler from .distributed_sampler import DistributedSampler from .sampler import SAMPLER, build_sampler ================================================ FILE: plugin/datasets/samplers/distributed_sampler.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- import math import torch from torch.utils.data import DistributedSampler as _DistributedSampler from .sampler import SAMPLER import numpy as np @SAMPLER.register_module() class DistributedSampler(_DistributedSampler): def __init__(self, dataset=None, num_replicas=None, rank=None, shuffle=True, seed=0): super().__init__( dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) # for the compatibility from PyTorch 1.3+ self.seed = seed if seed is not None else 0 self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.groups_num = len(self.group_sizes) self.groups = list(set(self.flag)) assert self.groups == list(range(self.groups_num)) # Now, for efficiency, make a dict {group_idx: List[dataset sample_idxs]} self.group_idx_to_sample_idxs = { group_idx: np.where(self.flag == group_idx)[0].tolist() for group_idx in range(self.groups_num)} num_groups_per_gpu = math.ceil(len(self.groups) / self.num_replicas) # assign groups (continuous videos) to each gpu rank # self.sample_group_idx = self.groups[self.rank*num_groups_per_gpu: min(len(self.groups), (self.rank+1)*num_groups_per_gpu)] self.sample_group_idx = self.groups[self.rank::self.num_replicas] self.sample_idxs = [] for i in self.sample_group_idx: self.sample_idxs.extend(self.group_idx_to_sample_idxs[i]) #print('Rank', rank, 'Num samples', len(self.sample_idxs), 'Samples', self.sample_idxs) self.num_samples = len(self.sample_idxs) self.total_size = len(self.dataset) def __iter__(self): # only used for validation/testing # only support batchsize = 1 if self.shuffle: assert False # else: # indices = torch.arange(len(self.dataset)).tolist() # # add extra samples to make it evenly divisible # # in case that indices is shorter than half of total_size # indices = (indices * # math.ceil(self.total_size / len(indices)))[:self.total_size] # assert len(indices) == self.total_size # # subsample # per_replicas = self.total_size//self.num_replicas # # indices = indices[self.rank:self.total_size:self.num_replicas] # indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas] # assert len(indices) == self.num_samples return iter(self.sample_idxs) ================================================ FILE: plugin/datasets/samplers/group_sampler.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- import math import itertools import copy import torch.distributed as dist import numpy as np import torch from mmcv.runner import get_dist_info from torch.utils.data import Sampler from .sampler import SAMPLER import random class GroupSampler(Sampler): def __init__(self, dataset, samples_per_gpu=1): assert hasattr(dataset, 'flag') self.dataset = dataset self.samples_per_gpu = samples_per_gpu self.flag = dataset.flag.astype(np.int64) self.group_sizes = np.bincount(self.flag) self.num_samples = 0 for i, size in enumerate(self.group_sizes): self.num_samples += int(np.ceil( size / self.samples_per_gpu)) * self.samples_per_gpu print('Warning!!! 
Only used for testing!') def __iter__(self): for i, size in enumerate(self.group_sizes): if size == 0: continue indice = np.where(self.flag == i)[0] assert len(indice) == size yield from indice def __len__(self): return self.num_samples @SAMPLER.register_module() class DistributedGroupSampler(Sampler): """Sampler that restricts data loading to a subset of the dataset. It is especially useful in conjunction with :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each process can pass a DistributedSampler instance as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. Arguments: dataset: Dataset used for sampling. num_replicas (optional): Number of processes participating in distributed training. rank (optional): Rank of the current process within num_replicas. seed (int, optional): random seed used to shuffle the sampler if ``shuffle=True``. This number should be identical across all processes in the distributed group. Default: 0. """ def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None, seed=0): _rank, _num_replicas = get_dist_info() if num_replicas is None: num_replicas = _num_replicas if rank is None: rank = _rank self.dataset = dataset self.samples_per_gpu = samples_per_gpu self.num_replicas = num_replicas self.rank = rank self.epoch = 0 self.seed = seed if seed is not None else 0 assert hasattr(self.dataset, 'flag') self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.num_samples = 0 for i, j in enumerate(self.group_sizes): self.num_samples += int( math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / self.num_replicas)) * self.samples_per_gpu self.total_size = self.num_samples * self.num_replicas def __iter__(self): # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch + self.seed) indices = [] for i, size in enumerate(self.group_sizes): if size > 0: indice = np.where(self.flag == i)[0] assert len(indice) == size # add .numpy() to avoid bug when selecting indice in parrots. # TODO: check whether torch.randperm() can be replaced by # numpy.random.permutation(). indice = indice[list( torch.randperm(int(size), generator=g).numpy())].tolist() extra = int( math.ceil( size * 1.0 / self.samples_per_gpu / self.num_replicas) ) * self.samples_per_gpu * self.num_replicas - len(indice) # pad indice tmp = indice.copy() for _ in range(extra // size): indice.extend(tmp) indice.extend(tmp[:extra % size]) indices.extend(indice) assert len(indices) == self.total_size indices = [ indices[j] for i in list( torch.randperm( len(indices) // self.samples_per_gpu, generator=g)) for j in range(i * self.samples_per_gpu, (i + 1) * self.samples_per_gpu) ] # subsample offset = self.num_samples * self.rank indices = indices[offset:offset + self.num_samples] assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples def set_epoch(self, epoch): self.epoch = epoch def sync_random_seed(seed=None, device='cuda'): """Make sure different ranks share the same seed. All workers must call this function, otherwise it will deadlock. This method is generally used in `DistributedSampler`, because the seed should be identical across all processes in the distributed group. In distributed sampling, different ranks should sample non-overlapped data in the dataset. Therefore, this function is used to make sure that each rank shuffles the data indices in the same order based on the same seed. 
Then different ranks could use different indices to select non-overlapped data from the same data list. Args: seed (int, Optional): The seed. Default to None. device (str): The device where the seed will be put on. Default to 'cuda'. Returns: int: Seed to be used. """ if seed is None: seed = np.random.randint(2**31) assert isinstance(seed, int) rank, num_replicas = get_dist_info() if num_replicas == 1: return seed if rank == 0: random_num = torch.tensor(seed, dtype=torch.int32, device=device) else: random_num = torch.tensor(0, dtype=torch.int32, device=device) dist.broadcast(random_num, src=0) return random_num.item() @SAMPLER.register_module() class InfiniteGroupEachSampleInBatchSampler(Sampler): """ Pardon this horrendous name. Basically, we want every sample to be from its own group. If batch size is 4 and # of GPUs is 8, each sample of these 32 should be operating on its own group. Shuffling is only done for group order, not done within groups. Arguments: dataset: Dataset used for sampling. min_len: Minimum sequence sampling length max_len: Maximum sequence sampling length num_iters_to_seq: After `num_iters_to_seq` iterations, start sequential sampling. Default: 0 samples_per_gpu (optional): Per gpu batchsize. Default: 1 num_replicas (optional): Number of processes participating in distributed training. rank (optional): Rank of the current process within num_replicas. seed (int, optional): random seed used to shuffle the sampler if ``shuffle=True``. This number should be identical across all processes in the distributed group. Default: 0. """ def __init__(self, dataset, seq_split_num=-1, num_iters_to_seq=0, random_drop=0, samples_per_gpu=1, num_replicas=None, rank=None, seed=0): _rank, _num_replicas = get_dist_info() if num_replicas is None: num_replicas = _num_replicas if rank is None: rank = _rank self.dataset = dataset self.batch_size = samples_per_gpu self.num_replicas = num_replicas self.rank = rank self.seq_split_num = seq_split_num self.sub_seq_generator = torch.Generator() self.sub_seq_generator.manual_seed(self.rank + seed) self.seed = sync_random_seed(seed) self.random_drop = random_drop self.size = len(self.dataset) self._iters = 0 self.num_iters_to_seq = num_iters_to_seq assert hasattr(self.dataset, 'flag') self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.groups_num = len(self.group_sizes) self.global_batch_size = samples_per_gpu * num_replicas assert self.groups_num >= self.global_batch_size # Now, for efficiency, make a dict {group_idx: List[dataset sample_idxs]} self.group_idx_to_sample_idxs = { group_idx: np.where(self.flag == group_idx)[0].tolist() for group_idx in range(self.groups_num)} self.group_idx_to_sample_idxs_generator = { group_idx: self._sample_sub_sequence(group_idx) for group_idx in range(self.groups_num) } # Get a generator per sample idx. 
Considering samples over all # GPUs, each sample position has its own generator self.group_indices_per_global_sample_idx = [ self._group_indices_per_global_sample_idx(self.rank * self.batch_size + local_sample_idx) for local_sample_idx in range(self.batch_size)] # Keep track of a buffer of dataset sample idxs for each local sample idx self.buffer_per_local_sample = [[] for _ in range(self.batch_size)] def _infinite_group_indices(self): g = torch.Generator() g.manual_seed(self.seed) while True: yield from torch.randperm(self.groups_num, generator=g).tolist() def _group_indices_per_global_sample_idx(self, global_sample_idx): yield from itertools.islice(self._infinite_group_indices(), global_sample_idx, None, self.global_batch_size) def _sample_sub_sequence(self, group_idx): '''randomly split sub-sequences in a whole sequence''' sample_ids = self.group_idx_to_sample_idxs[group_idx] while True: if self._iters < self.num_iters_to_seq or self.seq_split_num == -1: shuffled = torch.randperm(len(sample_ids), generator=self.sub_seq_generator).tolist() yield from [[sample_ids[i]] for i in shuffled] else: # split the sequence into parts idx = torch.randperm(len(sample_ids), generator=self.sub_seq_generator).tolist() idx.remove(0) idx = sorted(idx[:self.seq_split_num - 1]) # choose n-1 split position split_idx = [0] + idx + [len(sample_ids)] sub_seq_idx = [sample_ids[split_idx[i]: split_idx[i + 1]] for i in range(len(split_idx) - 1)] # [[1,2,3], [4,5], ...] shuffled = torch.randperm(len(sub_seq_idx), generator=self.sub_seq_generator).tolist() for i in shuffled: sub_seq = sub_seq_idx[i] length = len(sub_seq) drop_num = math.floor(length * self.random_drop) drop_idxs = torch.randperm(length, generator=self.sub_seq_generator).tolist()[:drop_num] new_sub_seq = [sub_seq[j] for j in range(length) if j not in drop_idxs] yield new_sub_seq # yield from [sub_seq_idx[i] for i in shuffled] def __iter__(self): last_group_idx_batch = [-1 for i in range(self.batch_size)] while True: curr_batch = [] for local_sample_idx in range(self.batch_size): if len(self.buffer_per_local_sample[local_sample_idx]) == 0: # Finished current group, refill with next group new_group_idx = next(self.group_indices_per_global_sample_idx[local_sample_idx]) # make sure the same sequence is not picked twice in a row # without this check, two consecutive segments of the same sequence may be drawn when the epoch wraps around if new_group_idx == last_group_idx_batch[local_sample_idx]: new_group_idx = next(self.group_indices_per_global_sample_idx[local_sample_idx]) last_group_idx_batch[local_sample_idx] = new_group_idx self.buffer_per_local_sample[local_sample_idx] = \ copy.deepcopy(next(self.group_idx_to_sample_idxs_generator[new_group_idx])) curr_batch.append(self.buffer_per_local_sample[local_sample_idx].pop(0)) self._iters += 1 yield curr_batch def __len__(self): """Length of base dataset.""" return self.size def set_epoch(self, epoch): self.epoch = epoch ================================================ FILE: plugin/datasets/samplers/sampler.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved.
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- from mmcv.utils.registry import Registry, build_from_cfg SAMPLER = Registry('sampler') def build_sampler(cfg, default_args): return build_from_cfg(cfg, SAMPLER, default_args) ================================================ FILE: plugin/datasets/visualize/renderer.py ================================================ import os.path as osp import os import av2.geometry.interpolate as interp_utils import numpy as np import copy import cv2 import matplotlib import matplotlib.pyplot as plt from PIL import Image matplotlib.use('agg') # prevent memory leak for drawing figures in a loop def remove_nan_values(uv): is_u_valid = np.logical_not(np.isnan(uv[:, 0])) is_v_valid = np.logical_not(np.isnan(uv[:, 1])) is_uv_valid = np.logical_and(is_u_valid, is_v_valid) uv_valid = uv[is_uv_valid] return uv_valid def points_ego2img(pts_ego, extrinsics, intrinsics): pts_ego_4d = np.concatenate([pts_ego, np.ones([len(pts_ego), 1])], axis=-1) pts_cam_4d = extrinsics @ pts_ego_4d.T uv = (intrinsics @ pts_cam_4d[:3, :]).T uv = remove_nan_values(uv) depth = uv[:, 2] uv = uv[:, :2] / uv[:, 2].reshape(-1, 1) return uv, depth def draw_polyline_ego_on_img(polyline_ego, img_bgr, extrinsics, intrinsics, color_bgr, thickness): if polyline_ego.shape[1] == 2: zeros = np.zeros((polyline_ego.shape[0], 1)) polyline_ego = np.concatenate([polyline_ego, zeros], axis=1) polyline_ego = interp_utils.interp_arc(t=500, points=polyline_ego) uv, depth = points_ego2img(polyline_ego, extrinsics, intrinsics) h, w, c = img_bgr.shape is_valid_x = np.logical_and(0 <= uv[:, 0], uv[:, 0] < w - 1) is_valid_y = np.logical_and(0 <= uv[:, 1], uv[:, 1] < h - 1) is_valid_z = depth > 0 is_valid_points = np.logical_and.reduce([is_valid_x, is_valid_y, is_valid_z]) if is_valid_points.sum() == 0: return uv = np.round(uv[is_valid_points]).astype(np.int32) draw_visible_polyline_cv2( copy.deepcopy(uv), valid_pts_bool=np.ones((len(uv), 1), dtype=bool), image=img_bgr, color=color_bgr, thickness_px=thickness, ) def draw_visible_polyline_cv2(line, valid_pts_bool, image, color, thickness_px): """Draw a polyline onto an image using given line segments. Args: line: Array of shape (K, 2) representing the coordinates of line. valid_pts_bool: Array of shape (K,) representing which polyline coordinates are valid for rendering. For example, if the coordinate is occluded, a user might specify that it is invalid. Line segments touching an invalid vertex will not be rendered. image: Array of shape (H, W, 3), representing a 3-channel BGR image color: Tuple of shape (3,) with a BGR format color thickness_px: thickness (in pixels) to use when rendering the polyline. 
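# --- Illustrative aside (made-up camera, not part of this file): the
# projection performed by `points_ego2img` above. Ego points are lifted to
# homogeneous coordinates, mapped to the camera frame with a 4x4 extrinsic,
# multiplied by the 3x3 intrinsics, and divided by depth to get pixels.
import numpy as np

pts_ego = np.array([[1.0, 0.5, 10.0]])         # one point, 10 m along the optical axis
extrinsics = np.eye(4)                         # assume ego frame == camera frame
intrinsics = np.array([[1000.0, 0.0, 800.0],
                       [0.0, 1000.0, 450.0],
                       [0.0, 0.0, 1.0]])

pts_h = np.concatenate([pts_ego, np.ones((len(pts_ego), 1))], axis=-1)
pts_cam = (extrinsics @ pts_h.T)[:3, :]
uvw = (intrinsics @ pts_cam).T
depth = uvw[:, 2]                              # -> [10.]
uv = uvw[:, :2] / uvw[:, 2:3]                  # -> [[900., 500.]]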
""" line = np.round(line).astype(int) # type: ignore for i in range(len(line) - 1): if (not valid_pts_bool[i]) or (not valid_pts_bool[i + 1]): continue x1 = line[i][0] y1 = line[i][1] x2 = line[i + 1][0] y2 = line[i + 1][1] # Use anti-aliasing (AA) for curves image = cv2.line(image, pt1=(x1, y1), pt2=(x2, y2), color=color, thickness=thickness_px, lineType=cv2.LINE_AA) COLOR_MAPS_BGR = { # bgr colors 'divider': (0, 0, 255), 'boundary': (0, 255, 0), 'ped_crossing': (255, 0, 0), 'centerline': (51, 183, 255), 'drivable_area': (171, 255, 255) } COLOR_MAPS_PLT = { 'divider': 'r', 'boundary': 'g', 'ped_crossing': 'b', 'centerline': 'orange', 'drivable_area': 'y', } CAM_NAMES_AV2 = ['ring_front_center', 'ring_front_right', 'ring_front_left', 'ring_rear_right','ring_rear_left', 'ring_side_right', 'ring_side_left', ] CAM_NAMES_NUSC = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT',] class Renderer(object): """Render map elements on image views. Args: cat2id (dict): category to class id roi_size (tuple): bev range dataset (str): 'av2' or 'nusc' """ def __init__(self, cat2id, roi_size, dataset='av2'): self.roi_size = roi_size self.cat2id = cat2id self.id2cat = {v: k for k, v in cat2id.items()} if dataset == 'av2': self.cam_names = CAM_NAMES_AV2 else: self.cam_names = CAM_NAMES_NUSC def render_bev_from_vectors(self, vectors, out_dir, draw_scores=False, specified_path=None, id_info=None): '''Render bev segmentation using vectorized map elements. Args: vectors (dict): dict of vectorized map elements. out_dir (str): output directory ''' car_img = Image.open('resources/car.png') #car_img = Image.open('resources/car_lidar_coord.png') if specified_path: map_path = specified_path else: map_path = os.path.join(out_dir, 'map.jpg') fig = plt.figure(figsize=(self.roi_size[0], self.roi_size[1])) ax = fig.add_subplot(1, 1, 1) ax.set_xlim(-self.roi_size[0] / 2, self.roi_size[0] / 2) ax.set_ylim(-self.roi_size[1] / 2, self.roi_size[1] / 2) ax.axis('off') #ax.imshow(car_img, extent=[-2.0, 2.0, -2.5, 2.5]) ax.imshow(car_img, extent=[-2.5, 2.5, -2.0, 2.0]) for label, vector_list in vectors.items(): cat = self.id2cat[label] color = COLOR_MAPS_PLT[cat] for vec_i, vector in enumerate(vector_list): if draw_scores: vector, score, prop = vector if isinstance(vector, list): vector = np.array(vector) from shapely.geometry import LineString vector = np.array(LineString(vector).simplify(0.2).coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) # plt.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1], angles='xy', color=color, # scale_units='xy', scale=1) # for i in range(len(x)): ax.plot(x, y, 'o-', color=color, linewidth=20, markersize=50) if draw_scores: #print('Prop:', prop, 'Label:', label) if prop: p = 'p' else: p = '' score = round(score, 2) mid_idx = len(x) // 2 ax.text(x[mid_idx], y[mid_idx], str(score)+p, fontsize=100, color=color) if id_info: vec_id = id_info[label][vec_i] mid_idx = len(x) // 2 ax.text(x[mid_idx], y[mid_idx], f'{cat[:1].upper()}{vec_id}', fontsize=100, color=color) #plt.savefig(map_path, bbox_inches='tight', dpi=40) fig.savefig(map_path, bbox_inches='tight', dpi=20) plt.clf() # or cla() to simulate use case of plotting fresh figures def render_camera_views_from_vectors(self, vectors, imgs, extrinsics, intrinsics, ego2cams, thickness, out_dir): '''Project vectorized map elements to camera views. Args: vectors (dict): dict of vectorized map elements. imgs (tensor): images in bgr color. 
extrinsics (array): ego2img extrinsics, shape (4, 4) intrinsics (array): intrinsics, shape (3, 3) thickness (int): thickness of lines to draw on images. out_dir (str): output directory ''' for i in range(len(imgs)): img = imgs[i] extrinsic = extrinsics[i] intrinsic = intrinsics[i] ego2cam = ego2cams[i] img_bgr = copy.deepcopy(img) for label, vector_list in vectors.items(): cat = self.id2cat[label] color = COLOR_MAPS_BGR[cat] for vector in vector_list: img_bgr = np.ascontiguousarray(img_bgr) if isinstance(vector, list): vector = np.array(vector) draw_polyline_ego_on_img(vector, img_bgr, ego2cam, intrinsic, color, thickness) out_path = osp.join(out_dir, self.cam_names[i]) + '.jpg' cv2.imwrite(out_path, img_bgr) def render_bev_from_mask(self, semantic_mask, out_dir, flip=False): '''Render bev segmentation from semantic_mask. Args: semantic_mask (array): semantic mask. out_dir (str): output directory ''' if len(semantic_mask.shape) == 3: c, h, w = semantic_mask.shape else: h, w = semantic_mask.shape bev_img = np.ones((3, h, w), dtype=np.uint8) * 255 if 'drivable_area' in self.cat2id: drivable_area_mask = semantic_mask[self.cat2id['drivable_area']] bev_img[:, drivable_area_mask == 1] = \ np.array(COLOR_MAPS_BGR['drivable_area']).reshape(3, 1) for label in self.id2cat: cat = self.id2cat[label] if cat == 'drivable_area': continue if len(semantic_mask.shape) == 3: valid = (semantic_mask[label] == 1) else: valid = semantic_mask == (label + 1) bev_img[:, valid] = np.array(COLOR_MAPS_BGR[cat]).reshape(3, 1) #for label in range(c): # cat = self.id2cat[label] # if cat == 'drivable_area': # continue # mask = semantic_mask[label] # valid = mask == 1 # bev_img[:, valid] = np.array(COLOR_MAPS_BGR[cat]).reshape(3, 1) out_path = osp.join(out_dir, 'semantic_map.jpg') if flip: bev_img_flipud = np.array([np.flipud(i) for i in bev_img], dtype=np.uint8) cv2.imwrite(out_path, bev_img_flipud.transpose((1, 2, 0))) else: cv2.imwrite(out_path, bev_img.transpose((1, 2, 0))) ================================================ FILE: plugin/models/__init__.py ================================================ from .backbones import * from .heads import * from .necks import * from .losses import * from .mapers import * from .transformer_utils import * from .assigner import * from .utils import * ================================================ FILE: plugin/models/assigner/__init__.py ================================================ from .assigner import HungarianLinesAssigner from .match_cost import MapQueriesCost, BBoxLogitsCost, DynamicLinesCost, IoUCostC, BBoxCostC, LinesL1Cost, LinesFixNumChamferCost, ClsSigmoidCost ================================================ FILE: plugin/models/assigner/assigner.py ================================================ import torch from mmdet.core.bbox.builder import BBOX_ASSIGNERS from mmdet.core.bbox.assigners import AssignResult from mmdet.core.bbox.assigners import BaseAssigner from mmdet.core.bbox.match_costs import build_match_cost from scipy.optimize import linear_sum_assignment import numpy as np @BBOX_ASSIGNERS.register_module() class HungarianLinesAssigner(BaseAssigner): """ Computes one-to-one matching between predictions and ground truth. This class computes an assignment between the targets and the predictions based on the costs. The cost is a weighted sum of two components: the classification cost and the regression cost. The targets don't include the no_object, so generally there are more predictions than targets.
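    Concretely, with the default `MapQueriesCost`, the cost of a (prediction, ground-truth) pair is `cls_cost + reg_cost` (plus an optional `iou_cost` when configured), where each term already carries its configured weight; the un-weighted regression cost is additionally passed out so that temporal label assignment can filter matches by raw geometric distance.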
After the one-to-one matching, the un-matched are treated as backgrounds. Thus each query prediction will be assigned with `0` or a positive integer indicating the ground truth index: - 0: negative sample, no assigned gt - positive integer: positive sample, index (1-based) of assigned gt Args: cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. bbox_weight (int | float, optional): The scale factor for regression L1 cost. Default 1.0. """ def __init__(self, cost=dict( type='MapQueriesCost', cls_cost=dict(type='ClassificationCost', weight=1.), reg_cost=dict(type='LinesCost', weight=1.0), ), **kwargs): self.cost = build_match_cost(cost) def assign(self, preds: dict, gts: dict, track_info=None, gt_bboxes_ignore=None, eps=1e-7): """ Computes one-to-one matching based on the weighted costs. This method assign each query prediction to a ground truth or background. The `assigned_gt_inds` with -1 means don't care, 0 means negative sample, and positive number is the index (1-based) of assigned gt. The assignment is done in the following steps, the order matters. 1. assign every prediction to -1 2. compute the weighted costs 3. do Hungarian matching on CPU based on the costs 4. assign all to 0 (background) first, then for each matched pair between predictions and gts, treat this prediction as foreground and assign the corresponding gt index (plus 1) to it. Args: lines_pred (Tensor): predicted normalized lines: [num_query, num_points, 2] cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. lines_gt (Tensor): Ground truth lines [num_gt, num_points, 2]. labels_gt (Tensor): Label of `gt_bboxes`, shape (num_gt,). gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are labelled as `ignored`. Default None. eps (int | float, optional): A value added to the denominator for numerical stability. Default 1e-7. Returns: :obj:`AssignResult`: The assigned result. """ assert gt_bboxes_ignore is None, \ 'Only case when gt_bboxes_ignore is None is supported.' num_gts, num_lines = gts['lines'].size(0), preds['lines'].size(0) # 1. assign -1 by default assigned_gt_inds = \ preds['lines'].new_full((num_lines,), -1, dtype=torch.long) assigned_labels = \ preds['lines'].new_full((num_lines,), -1, dtype=torch.long) if num_gts == 0 or num_lines == 0: # No ground truth or boxes, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels), None # 2. compute the weighted costs gt_permute_idx = None # (num_preds, num_gts) if self.cost.reg_cost.permute: cost, gt_permute_idx, reg_cost = self.cost(preds, gts) else: cost, reg_cost = self.cost(preds, gts) # Manipulate the cost matrix here using the two-frame matching info # for non-first-frame supervision if track_info is not None: prop_i = 0 # iterate through queries for j in range(cost.shape[0]): if j >= len(track_info['track_queries_fal_pos_mask']): # padding queries, loss will be filtered later cost[j] = np.inf continue if track_info['track_queries_fal_pos_mask'][j]: # false positive and palceholder track queries should not # be matched to any target cost[j] = np.inf # Tweak the cost matrix here to force the G.T. assignment of the track queries elif track_info['track_queries_mask'][j]: track_query_id = track_info['track_query_match_ids'][prop_i].long().item() prop_i += 1 cost[j] = np.inf cost[:, track_query_id] = np.inf cost[j, track_query_id] = -1 # 3. 
do Hungarian matching on CPU using linear_sum_assignment cost = cost.detach().cpu().numpy() if linear_sum_assignment is None: raise ImportError('Please run "pip install scipy" ' 'to install scipy first.') try: matched_row_inds, matched_col_inds = linear_sum_assignment(cost) except: print('cost max{}, min{}'.format(cost.max(), cost.min())) import pdb; pdb.set_trace() matched_row_inds = torch.from_numpy(matched_row_inds).to( preds['lines'].device) matched_col_inds = torch.from_numpy(matched_col_inds).to( preds['lines'].device) # Pass out the un-weighted reg cost for temporal propagation matched_reg_cost = reg_cost[matched_row_inds, matched_col_inds] # 4. assign backgrounds and foregrounds # assign all indices to backgrounds first assigned_gt_inds[:] = 0 # assign foregrounds based on matching results assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 assigned_labels[matched_row_inds] = gts['labels'][matched_col_inds] return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels), gt_permute_idx, matched_reg_cost ================================================ FILE: plugin/models/assigner/match_cost.py ================================================ import torch from mmdet.core.bbox.match_costs.builder import MATCH_COST from mmdet.core.bbox.match_costs import build_match_cost from torch.nn.functional import smooth_l1_loss from mmdet.core.bbox.iou_calculators import bbox_overlaps from mmdet.core.bbox.transforms import bbox_cxcywh_to_xyxy def chamfer_distance(line1, line2) -> float: ''' Calculate chamfer distance between two lines. Make sure the lines are interpolated. Args: line1 (tensor): shape (num_pts, 2) line2 (tensor): shape (num_pts, 2) Returns: distance (float): chamfer distance ''' dist_matrix = torch.cdist(line1, line2, p=2) dist12 = dist_matrix.min(-1)[0].sum() / len(line1) dist21 = dist_matrix.min(-2)[0].sum() / len(line2) return (dist12 + dist21) / 2 @MATCH_COST.register_module() class ClsSigmoidCost: """ClsSigmoidCost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def __call__(self, cls_pred, gt_labels): """ Args: cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: torch.Tensor: cls_cost value with weight """ # Following the official DETR repo, contrary to the loss in which # NLL is used, we approximate it in 1 - cls_score[gt_label]. # The 1 is a constant that doesn't change the matching, # so it can be omitted. cls_score = cls_pred.sigmoid() cls_cost = -cls_score[:, gt_labels] return cls_cost * self.weight @MATCH_COST.register_module() class LinesFixNumChamferCost(object): """LinesFixNumChamferCost.
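    Bidirectional Chamfer matching cost between fixed-length polylines. For a prediction P and a ground truth G, each with num_pts points, the cost is (sum_i min_j ||P_i - G_j|| + sum_j min_i ||P_i - G_j||) / (2 * num_pts). With `permute=True`, the ground truth carries several point orderings and the minimum cost over these permutations is kept, together with the index of the chosen permutation.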
Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.0, permute=False): self.weight = weight self.permute = permute def __call__(self, lines_pred, gt_lines): """ Args: lines_pred (Tensor): predicted normalized lines: [num_query, 2*num_points] gt_lines (Tensor): Ground truth lines [num_gt, 2*num_points] or [num_gt, num_permute, 2*num_points] Returns: torch.Tensor: reg_cost value with weight shape [num_pred, num_gt] """ if self.permute: assert len(gt_lines.shape) == 3 else: assert len(gt_lines.shape) == 2 num_gt, num_pred = len(gt_lines), len(lines_pred) if self.permute: gt_lines = gt_lines.flatten(0, 1) # (num_gt*num_permute, 2*num_pts) num_pts = lines_pred.shape[-1] // 2 lines_pred = lines_pred.view(-1, 2) # [num_query*num_points, 2] gt_lines = gt_lines.view(-1, 2) # [num_gt*num_points, 2] dist_mat = torch.cdist(lines_pred, gt_lines, p=2) # (num_query*num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=-1)) # (num_gt, num_query*num_points, num_pts) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=1)) # (num_q, num_gt, num_pts, num_pts) dist1 = dist_mat.min(-1)[0].sum(-1) dist2 = dist_mat.min(-2)[0].sum(-1) dist_mat = (dist1 + dist2) / (2 * num_pts) # (num_pred, num_gt) if self.permute: # dist_mat: (num_pred, num_gt*num_permute) dist_mat = dist_mat.view(num_pred, num_gt, -1) # (num_pred, num_gt, num_permute) dist_mat, gt_permute_index = dist_mat.min(-1) return dist_mat * self.weight, gt_permute_index return dist_mat * self.weight @MATCH_COST.register_module() class LinesL1Cost(object): """LinesL1Cost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.0, beta=0.0, permute=False): self.weight = weight self.permute = permute self.beta = beta def __call__(self, lines_pred, gt_lines, **kwargs): """ Args: lines_pred (Tensor): predicted normalized lines: [num_query, 2*num_points] gt_lines (Tensor): Ground truth lines [num_gt, 2*num_points] or [num_gt, num_permute, 2*num_points] Returns: torch.Tensor: reg_cost value with weight shape [num_pred, num_gt] """ if self.permute: assert len(gt_lines.shape) == 3 else: assert len(gt_lines.shape) == 2 num_pred, num_gt = len(lines_pred), len(gt_lines) if self.permute: # permute-invarint labels gt_lines = gt_lines.flatten(0, 1) # (num_gt*num_permute, 2*num_pts) num_pts = lines_pred.shape[-1]//2 if self.beta > 0: lines_pred = lines_pred.unsqueeze(1).repeat(1, len(gt_lines), 1) gt_lines = gt_lines.unsqueeze(0).repeat(num_pred, 1, 1) dist_mat = smooth_l1_loss(lines_pred, gt_lines, reduction='none', beta=self.beta).sum(-1) else: dist_mat = torch.cdist(lines_pred, gt_lines, p=1) dist_mat = dist_mat / num_pts if self.permute: # dist_mat: (num_pred, num_gt*num_permute) dist_mat = dist_mat.view(num_pred, num_gt, -1) # (num_pred, num_gt, num_permute) dist_mat, gt_permute_index = torch.min(dist_mat, 2) return dist_mat * self.weight, gt_permute_index return dist_mat * self.weight @MATCH_COST.register_module() class BBoxCostC: """BBoxL1Cost. 
Args: weight (int | float, optional): loss_weight box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN Examples: >>> from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost >>> import torch >>> self = BBoxL1Cost() >>> bbox_pred = torch.rand(1, 4) >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) >>> factor = torch.tensor([10, 8, 10, 8]) >>> self(bbox_pred, gt_bboxes, factor) tensor([[1.6172, 1.6422]]) """ def __init__(self, weight=1., box_format='xyxy'): self.weight = weight assert box_format in ['xyxy', 'xywh'] self.box_format = box_format def __call__(self, bbox_pred, gt_bboxes): """ Args: bbox_pred (Tensor): Predicted boxes with normalized coordinates (cx, cy, w, h), which are all in range [0, 1]. Shape [num_query, 4]. gt_bboxes (Tensor): Ground truth boxes with normalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. Returns: torch.Tensor: bbox_cost value with weight """ # if self.box_format == 'xywh': # gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes) # elif self.box_format == 'xyxy': # bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) return bbox_cost * self.weight @MATCH_COST.register_module() class IoUCostC: """IoUCost. Args: iou_mode (str, optional): iou mode such as 'iou' | 'giou' weight (int | float, optional): loss weight Examples: >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost >>> import torch >>> self = IoUCost() >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) >>> self(bboxes, gt_bboxes) tensor([[-0.1250, 0.1667], [ 0.1667, -0.5000]]) """ def __init__(self, iou_mode='giou', weight=1., box_format='xywh'): self.weight = weight self.iou_mode = iou_mode assert box_format in ['xyxy', 'xywh'] self.box_format = box_format def __call__(self, bboxes, gt_bboxes): """ Args: bboxes (Tensor): Predicted boxes with unnormalized coordinates (x1, y1, x2, y2). Shape [num_query, 4]. gt_bboxes (Tensor): Ground truth boxes with unnormalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. Returns: torch.Tensor: iou_cost value with weight """ if self.box_format == 'xywh': bboxes = bbox_cxcywh_to_xyxy(bboxes) gt_bboxes = bbox_cxcywh_to_xyxy(gt_bboxes) # overlaps: [num_bboxes, num_gt] overlaps = bbox_overlaps( bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False) # The 1 is a constant that doesn't change the matching, so omitted. iou_cost = -overlaps return iou_cost * self.weight @MATCH_COST.register_module() class DynamicLinesCost(object): """LinesL1Cost. 
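    Mask-aware ('dynamic') variant of the line matching cost: point-wise L2 distances between prediction and ground truth are averaged only over the points marked valid by the predicted and ground-truth masks (see `get_dynamic_line`).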
Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def __call__(self, lines_pred, lines_gt, masks_pred, masks_gt): """ Args: lines_pred (Tensor): predicted normalized lines: [nP, num_points, 2] lines_gt (Tensor): Ground truth lines [nG, num_points, 2] masks_pred: [nP, num_points] masks_gt: [nG, num_points] Returns: dist_mat: reg_cost value with weight shape [nP, nG] """ dist_mat = self.cal_dist(lines_pred, lines_gt) dist_mat = self.get_dynamic_line(dist_mat, masks_pred, masks_gt) dist_mat = dist_mat * self.weight return dist_mat def cal_dist(self, x1, x2): ''' Args: x1: B1,N,2 x2: B2,N,2 Return: dist_mat: B1,B2,N ''' x1 = x1.permute(1, 0, 2) x2 = x2.permute(1, 0, 2) dist_mat = torch.cdist(x1, x2, p=2) dist_mat = dist_mat.permute(1, 2, 0) return dist_mat def get_dynamic_line(self, mat, m1, m2): ''' get dynamic line with difference approach mat: N1xN2xnpts m1: N1xnpts m2: N2xnpts ''' # nPxnGxnum_points m1 = m1.unsqueeze(1).sigmoid() > 0.5 m2 = m2.unsqueeze(0) valid_points_mask = (m1 + m2)/2. average_factor_mask = valid_points_mask.sum(-1) > 0 average_factor = average_factor_mask.masked_fill( ~average_factor_mask, 1) # takes the average mat = mat * valid_points_mask mat = mat.sum(-1) / average_factor return mat @MATCH_COST.register_module() class BBoxLogitsCost(object): """BBoxLogits. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def calNLL(self, logits, value): ''' Args: logits: B1, 8, cls_dim value: B2, 8, Return: log_likelihood: B1,B2,8 ''' logits = logits[:, None] value = value[None] value = value.long().unsqueeze(-1) value, log_pmf = torch.broadcast_tensors(value, logits) value = value[..., :1] return log_pmf.gather(-1, value).squeeze(-1) def __call__(self, bbox_pred, bbox_gt, **kwargs): """ Args: bbox_pred: nproposal, 4*2, pos_dim bbox_gt: ngt, 4*2 Returns: cost: nproposal, ngt """ cost = self.calNLL(bbox_pred, bbox_gt).mean(-1) return cost * self.weight @MATCH_COST.register_module() class MapQueriesCost(object): def __init__(self, cls_cost, reg_cost, iou_cost=None): self.cls_cost = build_match_cost(cls_cost) self.reg_cost = build_match_cost(reg_cost) self.iou_cost = None if iou_cost is not None: self.iou_cost = build_match_cost(iou_cost) def __call__(self, preds: dict, gts: dict): # classification and bboxcost. cls_cost = self.cls_cost(preds['scores'], gts['labels']) # regression cost regkwargs = {} if 'masks' in preds and 'masks' in gts: assert isinstance(self.reg_cost, DynamicLinesCost), ' Issues!!' regkwargs = { 'masks_pred': preds['masks'], 'masks_gt': gts['masks'], } reg_cost = self.reg_cost(preds['lines'], gts['lines'], **regkwargs) if self.reg_cost.permute: reg_cost, gt_permute_idx = reg_cost # weighted sum of above three costs cost = cls_cost + reg_cost # Need to pass the reg cost out, and use this to filter deviated # instances for temporal label assignment... 
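        # Dividing by the configured weight recovers the un-weighted regression distance, so downstream thresholds do not depend on the cost weight.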
raw_reg_cost = reg_cost / self.reg_cost.weight # Iou if self.iou_cost is not None: iou_cost = self.iou_cost(preds['lines'],gts['lines']) cost += iou_cost if self.reg_cost.permute: return cost, gt_permute_idx, raw_reg_cost return cost, raw_reg_cost ================================================ FILE: plugin/models/backbones/__init__.py ================================================ from .bevformer_backbone import BEVFormerBackbone ================================================ FILE: plugin/models/backbones/bevformer/__init__.py ================================================ from .custom_base_transformer_layer import MyCustomBaseTransformerLayer from .encoder import BEVFormerEncoder from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D, MSIPM3D from .temporal_self_attention import TemporalSelfAttention from .transformer import PerceptionTransformer from .temporal_net import TemporalNet ================================================ FILE: plugin/models/backbones/bevformer/custom_base_transformer_layer.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import copy import warnings import torch import torch.nn as nn from mmcv import ConfigDict, deprecated_api_warning from mmcv.cnn import Linear, build_activation_layer, build_norm_layer from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) # Avoid BC-breaking of importing MultiScaleDeformableAttention from this file try: from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 warnings.warn( ImportWarning( '``MultiScaleDeformableAttention`` has been moved to ' '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 )) except ImportError: warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' '``mmcv.ops.multi_scale_deform_attn``, ' 'You should install ``mmcv-full`` if you need this module. ') from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention @TRANSFORMER_LAYER.register_module() class MyCustomBaseTransformerLayer(BaseModule): """Base `TransformerLayer` for vision transformer. It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. 
ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. """ def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=True, **kwargs): deprecated_args = dict( feedforward_channels='feedforward_channels', ffn_dropout='ffn_drop', ffn_num_fcs='num_fcs') for ori_name, new_name in deprecated_args.items(): if ori_name in kwargs: warnings.warn( f'The arguments `{ori_name}` in BaseTransformerLayer ' f'has been deprecated, now you should set `{new_name}` ' f'and other FFN related arguments ' f'to a dict named `ffn_cfgs`. ') ffn_cfgs[new_name] = kwargs[ori_name] super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) self.batch_first = batch_first assert set(operation_order) & set( ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ set(operation_order), f'The operation_order of' \ f' {self.__class__.__name__} should ' \ f'contains all four operation type ' \ f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" num_attn = operation_order.count('self_attn') + operation_order.count( 'cross_attn') if isinstance(attn_cfgs, dict): attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] else: assert num_attn == len(attn_cfgs), f'The length ' \ f'of attn_cfg {num_attn} is ' \ f'not consistent with the number of attention' \ f'in operation_order {operation_order}.' self.num_attn = num_attn self.operation_order = operation_order self.norm_cfg = norm_cfg self.pre_norm = operation_order[0] == 'norm' self.attentions = ModuleList() index = 0 for operation_name in operation_order: if operation_name in ['self_attn', 'cross_attn']: if 'batch_first' in attn_cfgs[index]: assert self.batch_first == attn_cfgs[index]['batch_first'] else: attn_cfgs[index]['batch_first'] = self.batch_first attention = build_attention(attn_cfgs[index]) # Some custom attentions used as `self_attn` # or `cross_attn` can have different behavior. 
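                # Record the role each attention module plays; custom attention implementations may branch on this name.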
attention.operation_name = operation_name self.attentions.append(attention) index += 1 self.embed_dims = self.attentions[0].embed_dims self.ffns = ModuleList() num_ffns = operation_order.count('ffn') if isinstance(ffn_cfgs, dict): ffn_cfgs = ConfigDict(ffn_cfgs) if isinstance(ffn_cfgs, dict): ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: ffn_cfgs['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( build_feedforward_network(ffn_cfgs[ffn_index])) self.norms = ModuleList() num_norms = operation_order.count('norm') for _ in range(num_norms): self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) def forward(self, query, key=None, value=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. """ norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': temp_key = temp_value = query query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query @TRANSFORMER_LAYER.register_module() class MyCustomBaseTransformerLayerWithoutSelfAttn(BaseModule): """Base `TransformerLayer` for vision transformer. 
It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. """ def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=True, **kwargs): deprecated_args = dict( feedforward_channels='feedforward_channels', ffn_dropout='ffn_drop', ffn_num_fcs='num_fcs') for ori_name, new_name in deprecated_args.items(): if ori_name in kwargs: warnings.warn( f'The arguments `{ori_name}` in BaseTransformerLayer ' f'has been deprecated, now you should set `{new_name}` ' f'and other FFN related arguments ' f'to a dict named `ffn_cfgs`. ') ffn_cfgs[new_name] = kwargs[ori_name] super(MyCustomBaseTransformerLayerWithoutSelfAttn, self).__init__(init_cfg) self.batch_first = batch_first assert set(operation_order) & set( ['norm', 'ffn', 'cross_attn']) == \ set(operation_order), f'The operation_order of' \ f' {self.__class__.__name__} should ' \ f'contains all three operation type ' \ f"{['norm', 'ffn', 'cross_attn']}" num_attn = operation_order.count( 'cross_attn') if isinstance(attn_cfgs, dict): attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] else: assert num_attn == len(attn_cfgs), f'The length ' \ f'of attn_cfg {num_attn} is ' \ f'not consistent with the number of attention' \ f'in operation_order {operation_order}.' self.num_attn = num_attn self.operation_order = operation_order self.norm_cfg = norm_cfg self.pre_norm = operation_order[0] == 'norm' self.attentions = ModuleList() index = 0 for operation_name in operation_order: if operation_name in ['self_attn', 'cross_attn']: if 'batch_first' in attn_cfgs[index]: assert self.batch_first == attn_cfgs[index]['batch_first'] else: attn_cfgs[index]['batch_first'] = self.batch_first attention = build_attention(attn_cfgs[index]) # Some custom attentions used as `self_attn` # or `cross_attn` can have different behavior. 
attention.operation_name = operation_name self.attentions.append(attention) index += 1 self.embed_dims = self.attentions[0].embed_dims self.ffns = ModuleList() num_ffns = operation_order.count('ffn') if isinstance(ffn_cfgs, dict): ffn_cfgs = ConfigDict(ffn_cfgs) if isinstance(ffn_cfgs, dict): ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: ffn_cfgs['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( build_feedforward_network(ffn_cfgs[ffn_index])) self.norms = ModuleList() num_norms = operation_order.count('norm') for _ in range(num_norms): self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) def forward(self, query, key=None, value=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
""" norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': temp_key = temp_value = query query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query ================================================ FILE: plugin/models/backbones/bevformer/encoder.py ================================================ """ Borrowed from StreamMapNet, and add BEV memory fusion """ from .custom_base_transformer_layer import MyCustomBaseTransformerLayer from .temporal_net import TemporalNet import copy import warnings from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import TransformerLayerSequence from mmcv.runner import force_fp32, auto_fp16 import numpy as np import torch import torch.nn as nn from mmcv.utils import TORCH_VERSION, digit_version from mmcv.utils import ext_loader from einops import rearrange ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @TRANSFORMER_LAYER_SEQUENCE.register_module() class BEVFormerEncoder(TransformerLayerSequence): """ Attention with both self and cross Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', **kwargs): super(BEVFormerEncoder, self).__init__(*args, **kwargs) self.return_intermediate = return_intermediate temporal_mem_layers = [] for _ in range(self.num_layers): mem_conv = TemporalNet(history_steps=4, hidden_dims=self.embed_dims, num_blocks=1) temporal_mem_layers.append(mem_conv) self.temporal_mem_layers = nn.ModuleList(temporal_mem_layers) self.num_points_in_pillar = num_points_in_pillar self.pc_range = pc_range self.fp16_enabled = False @staticmethod def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): """Get the reference points used in SCA and TSA. Args: H, W: spatial shape of bev. Z: hight of pillar. D: sample D points uniformly from each pillar. device (obj:`device`): The device where reference_points should be. Returns: Tensor: reference points used in decoder, has \ shape (bs, num_keys, num_levels, 2). 
""" # reference points in 3D space, used in spatial cross-attention (SCA) if dim == '3d': zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W # ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, # device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H # change y-axis direction ys = torch.linspace(H - 0.5, 0.5, H, dtype=dtype, device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H ref_3d = torch.stack((xs, ys, zs), -1) ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) return ref_3d # reference points on 2D bev plane, used in temporal self-attention (TSA). elif dim == '2d': ref_y, ref_x = torch.meshgrid( # torch.linspace( # 0.5, H - 0.5, H, dtype=dtype, device=device), torch.linspace( H - 0.5, 0.5, H, dtype=dtype, device=device), torch.linspace( 0.5, W - 0.5, W, dtype=dtype, device=device) ) ref_y = ref_y.reshape(-1)[None] / H ref_x = ref_x.reshape(-1)[None] / W ref_2d = torch.stack((ref_x, ref_y), -1) ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) return ref_2d # This function must use fp32!!! @force_fp32(apply_to=('reference_points', 'img_metas')) def point_sampling(self, reference_points, pc_range, img_metas): ego2img = [] for img_meta in img_metas: ego2img.append(img_meta['ego2img']) ego2img = np.asarray(ego2img) ego2img = reference_points.new_tensor(ego2img) # (B, N, 4, 4) reference_points = reference_points.clone() reference_points[..., 0:1] = reference_points[..., 0:1] * \ (pc_range[3] - pc_range[0]) + pc_range[0] reference_points[..., 1:2] = reference_points[..., 1:2] * \ (pc_range[4] - pc_range[1]) + pc_range[1] reference_points[..., 2:3] = reference_points[..., 2:3] * \ (pc_range[5] - pc_range[2]) + pc_range[2] reference_points = torch.cat( (reference_points, torch.ones_like(reference_points[..., :1])), -1) reference_points = reference_points.permute(1, 0, 2, 3) D, B, num_query = reference_points.size()[:3] num_cam = ego2img.size(1) reference_points = reference_points.view( D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) ego2img = ego2img.view( 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) reference_points_cam = torch.matmul(ego2img.to(torch.float32), reference_points.to(torch.float32)).squeeze(-1) eps = 1e-5 bev_mask = (reference_points_cam[..., 2:3] > eps) reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) & (reference_points_cam[..., 1:2] < 1.0) & (reference_points_cam[..., 0:1] < 1.0) & (reference_points_cam[..., 0:1] > 0.0)) if digit_version(TORCH_VERSION) >= digit_version('1.8'): bev_mask = torch.nan_to_num(bev_mask) else: bev_mask = bev_mask.new_tensor( np.nan_to_num(bev_mask.cpu().numpy())) reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) return reference_points_cam, bev_mask @auto_fp16() def forward(self, bev_query, key, value, *args, bev_h=None, bev_w=None, bev_pos=None, spatial_shapes=None, level_start_index=None, prev_bev=None, shift=0., warped_history_bev=None, **kwargs): """Forward function for 
`TransformerDecoder`. Args: bev_query (Tensor): Input BEV query with shape `(num_query, bs, embed_dims)`. key & value (Tensor): Input multi-cameta features with shape (num_cam, num_value, bs, embed_dims) reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, otherwise has shape ((bs, num_query, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. """ output = bev_query intermediate = [] ref_3d = self.get_reference_points( bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) ref_2d = self.get_reference_points( bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) reference_points_cam, bev_mask = self.point_sampling( ref_3d, self.pc_range, kwargs['img_metas']) # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper. # shift_ref_2d = ref_2d # .clone() shift_ref_2d = ref_2d.clone() shift_ref_2d += shift[:, None, None, :] # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) bev_query = bev_query.permute(1, 0, 2) bev_pos = bev_pos.permute(1, 0, 2) bs, len_bev, num_bev_level, _ = ref_2d.shape if prev_bev is not None: prev_bev = prev_bev.permute(1, 0, 2) prev_bev = torch.stack( [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( bs*2, len_bev, num_bev_level, 2) else: hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( bs*2, len_bev, num_bev_level, 2) for lid, layer in enumerate(self.layers): output = layer( bev_query, key, value, *args, bev_pos=bev_pos, ref_2d=hybird_ref_2d, ref_3d=ref_3d, bev_h=bev_h, bev_w=bev_w, spatial_shapes=spatial_shapes, level_start_index=level_start_index, reference_points_cam=reference_points_cam, bev_mask=bev_mask, prev_bev=prev_bev, warped_history_bev=warped_history_bev, **kwargs) # BEV memory fusion layer mem_layer = self.temporal_mem_layers[lid] curr_feat = rearrange(output, 'b (h w) c -> b c h w', h=warped_history_bev.shape[3]) fused_output = mem_layer(warped_history_bev, curr_feat) fused_output = rearrange(fused_output, 'b c h w -> b (h w) c') output = output + fused_output bev_query = output if self.return_intermediate: intermediate.append(output) if self.return_intermediate: return torch.stack(intermediate) return output @TRANSFORMER_LAYER.register_module() class BEVFormerLayer(MyCustomBaseTransformerLayer): """Implements decoder layer in DETR transformer. Args: attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): Configs for self_attention or cross_attention, the order should be consistent with it in `operation_order`. If it is a dict, it would be expand to the number of attention in `operation_order`. feedforward_channels (int): The hidden dimension for FFNs. ffn_dropout (float): Probability of an element to be zeroed in ffn. Default 0.0. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Default:None act_cfg (dict): The activation config for FFNs. Default: `LN` norm_cfg (dict): Config dict for normalization layer. Default: `LN`. ffn_num_fcs (int): The number of fully-connected layers in FFNs. Default:2. 
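    Example (illustrative config sketch; the dimensions, number of levels and dropout below are placeholders rather than values copied from a shipped MapTracker config):
        >>> layer_cfg = dict(
        ...     type='BEVFormerLayer',
        ...     attn_cfgs=[
        ...         dict(type='TemporalSelfAttention', embed_dims=256, num_levels=1),
        ...         dict(type='SpatialCrossAttention', embed_dims=256,
        ...              deformable_attention=dict(type='MSDeformableAttention3D',
        ...                                        embed_dims=256, num_levels=4)),
        ...     ],
        ...     feedforward_channels=512,
        ...     ffn_dropout=0.1,
        ...     operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'))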
""" def __init__(self, attn_cfgs, feedforward_channels, ffn_dropout=0.0, operation_order=None, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN'), ffn_num_fcs=2, **kwargs): super(BEVFormerLayer, self).__init__( attn_cfgs=attn_cfgs, feedforward_channels=feedforward_channels, ffn_dropout=ffn_dropout, operation_order=operation_order, act_cfg=act_cfg, norm_cfg=norm_cfg, ffn_num_fcs=ffn_num_fcs, **kwargs) self.fp16_enabled = False assert len(operation_order) == 6 assert set(operation_order) == set( ['self_attn', 'norm', 'cross_attn', 'ffn']) def forward(self, query, key=None, value=None, bev_pos=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, ref_2d=None, ref_3d=None, bev_h=None, bev_w=None, reference_points_cam=None, mask=None, spatial_shapes=None, level_start_index=None, prev_bev=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
""" norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: # temporal self attention if layer == 'self_attn': query = self.attentions[attn_index]( query, prev_bev, prev_bev, identity if self.pre_norm else None, query_pos=bev_pos, key_pos=bev_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, reference_points=ref_2d, spatial_shapes=torch.tensor( [[bev_h, bev_w]], device=query.device), level_start_index=torch.tensor([0], device=query.device), **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 # spaital cross attention elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, reference_points=ref_3d, reference_points_cam=reference_points_cam, mask=mask, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, spatial_shapes=spatial_shapes, level_start_index=level_start_index, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query ================================================ FILE: plugin/models/backbones/bevformer/grid_mask.py ================================================ import torch import torch.nn as nn import numpy as np from PIL import Image from mmcv.runner import force_fp32, auto_fp16 class Grid(object): def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio self.mode=mode self.st_prob = prob self.prob = prob def set_prob(self, epoch, max_epoch): self.prob = self.st_prob * epoch / max_epoch def __call__(self, img, label): if np.random.rand() > self.prob: return img, label h = img.size(1) w = img.size(2) self.d1 = 2 self.d2 = min(h, w) hh = int(1.5*h) ww = int(1.5*w) d = np.random.randint(self.d1, self.d2) if self.ratio == 1: self.l = np.random.randint(1, d) else: self.l = min(max(int(d*self.ratio+0.5),1),d-1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: for i in range(hh//d): s = d*i + st_h t = min(s+self.l, hh) mask[s:t,:] *= 0 if self.use_w: for i in range(ww//d): s = d*i + st_w t = min(s+self.l, ww) mask[:,s:t] *= 0 r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] mask = torch.from_numpy(mask).float() if self.mode == 1: mask = 1-mask mask = mask.expand_as(img) if self.offset: offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() offset = (1 - mask) * offset img = img * mask + offset else: img = img * mask return img, label class GridMask(nn.Module): def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): super(GridMask, self).__init__() self.use_h = use_h self.use_w = use_w self.rotate = rotate 
self.offset = offset self.ratio = ratio self.mode = mode self.st_prob = prob self.prob = prob self.fp16_enable = False def set_prob(self, epoch, max_epoch): self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 def set_ratio_and_prob(self, ratio, prob): self.prob = prob self.ratio = ratio @auto_fp16() def forward(self, x): if np.random.rand() > self.prob or not self.training: return x n,c,h,w = x.size() x = x.view(-1,h,w) hh = int(1.5*h) ww = int(1.5*w) d = np.random.randint(2, h) self.l = min(max(int(d*self.ratio+0.5),1),d-1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: for i in range(hh//d): s = d*i + st_h t = min(s+self.l, hh) mask[s:t,:] *= 0 if self.use_w: for i in range(ww//d): s = d*i + st_w t = min(s+self.l, ww) mask[:,s:t] *= 0 r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] mask = torch.from_numpy(mask).to(x.dtype).cuda() if self.mode == 1: mask = 1-mask mask = mask.expand_as(x) if self.offset: offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).to(x.dtype).cuda() x = x * mask + offset * (1 - mask) else: x = x * mask return x.view(n,c,h,w) ================================================ FILE: plugin/models/backbones/bevformer/multi_scale_deformable_attn_function.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import torch from torch.cuda.amp import custom_bwd, custom_fwd from torch.autograd.function import Function, once_differentiable from mmcv.utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) class MultiScaleDeformableAttnFunction_fp16(Function): @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. Returns: Tuple[Tensor]: Gradient of input tensors in forward. 
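        Note: the returned tuple is positionally aligned with the six inputs of `forward` (value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step); only `value`, `sampling_locations` and `attention_weights` receive gradients, the remaining slots are None.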
""" value, value_spatial_shapes, value_level_start_index, \ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None class MultiScaleDeformableAttnFunction_fp32(Function): @staticmethod @custom_fwd(cast_inputs=torch.float32) def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. Returns: Tuple[Tensor]: Gradient of input tensors in forward. """ value, value_spatial_shapes, value_level_start_index, \ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None ================================================ FILE: plugin/models/backbones/bevformer/spatial_cross_attention.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch import warnings import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init, constant_init from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import build_attention import math from mmcv.runner import force_fp32, auto_fp16 from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.utils import ext_loader from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ MultiScaleDeformableAttnFunction_fp16 ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @ATTENTION.register_module() class SpatialCrossAttention(BaseModule): """An attention module used in BEVFormer. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_cams (int): The number of cameras dropout (float): A Dropout layer on `inp_residual`. Default: 0.. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. deformable_attention: (dict): The config for the deformable attention used in SCA. """ def __init__(self, embed_dims=256, num_cams=6, pc_range=None, dropout=0.1, init_cfg=None, batch_first=False, deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=256, num_levels=4), **kwargs ): super(SpatialCrossAttention, self).__init__(init_cfg) self.init_cfg = init_cfg self.dropout = nn.Dropout(dropout) self.pc_range = pc_range self.fp16_enabled = False self.deformable_attention = build_attention(deformable_attention) self.embed_dims = embed_dims self.num_cams = num_cams self.output_proj = nn.Linear(embed_dims, embed_dims) self.batch_first = batch_first self.init_weight() def init_weight(self): """Default initialization for Parameters of Module.""" xavier_init(self.output_proj, distribution='uniform', bias=0.) @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) def forward(self, query, key, value, residual=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, reference_points_cam=None, bev_mask=None, level_start_index=None, flag='encoder', **kwargs): """Forward Function of Detr3DCrossAtten. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. (B, N, C, H, W) residual (Tensor): The tensor used for addition, with the same shape as `x`. Default None. If None, `x` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, 4), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different level. With shape (num_levels, 2), last dimension represent (h, w). level_start_index (Tensor): The start index of each level. 
A tensor has shape (num_levels) and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if key is None: key = query if value is None: value = key if residual is None: inp_residual = query slots = torch.zeros_like(query) if query_pos is not None: query = query + query_pos bs, num_query, _ = query.size() D = reference_points_cam.size(3) indexes = [] for i, mask_per_img in enumerate(bev_mask): index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1) indexes.append(index_query_per_img) max_len = max([len(each) for each in indexes]) # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. queries_rebatch = query.new_zeros( [bs, self.num_cams, max_len, self.embed_dims]) reference_points_rebatch = reference_points_cam.new_zeros( [bs, self.num_cams, max_len, D, 2]) for j in range(bs): for i, reference_points_per_img in enumerate(reference_points_cam): index_query_per_img = indexes[i] queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] num_cams, l, bs, embed_dims = key.shape key = key.permute(2, 0, 1, 3).reshape( bs * self.num_cams, l, self.embed_dims) value = value.permute(2, 0, 1, 3).reshape( bs * self.num_cams, l, self.embed_dims) queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) for j in range(bs): for i, index_query_per_img in enumerate(indexes): slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] count = bev_mask.sum(-1) > 0 count = count.permute(1, 2, 0).sum(-1) count = torch.clamp(count, min=1.0) slots = slots / count[..., None] slots = self.output_proj(slots) return self.dropout(slots) + inp_residual @ATTENTION.register_module() class MSDeformableAttention3D(BaseModule): """An attention module used in BEVFormer based on Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. 
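        Example (editor's illustrative sketch; the values shown match the ``__init__``
        defaults below and are not prescribed by any MapTracker config)::

            >>> from mmcv.cnn.bricks.transformer import build_attention
            >>> attn = build_attention(dict(
            ...     type='MSDeformableAttention3D',
            ...     embed_dims=256, num_heads=8, num_levels=4, num_points=8))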
""" def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=8, im2col_step=64, dropout=0.1, batch_first=True, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.batch_first = batch_first self.output_proj = None self.fp16_enabled = False # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape ( bs, num_query, embed_dims). key (Tensor): The key tensor with shape `(bs, num_key, embed_dims)`. value (Tensor): The value tensor with shape `(bs, num_key, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. 
Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) sampling_offsets = self.sampling_offsets(query).view( bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_levels * self.num_points) attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) if reference_points.shape[-1] == 2: """ For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. For each referent point, we sample `num_points` sampling points. For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. """ offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) bs, num_query, num_Z_anchors, xy = reference_points.shape reference_points = reference_points[:, :, None, None, None, :, :] sampling_offsets = sampling_offsets / \ offset_normalizer[None, None, None, :, None, :] bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape sampling_offsets = sampling_offsets.view( bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) sampling_locations = reference_points + sampling_offsets bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape assert num_all_points == num_points * num_Z_anchors sampling_locations = sampling_locations.view( bs, num_query, num_heads, num_levels, num_all_points, xy) elif reference_points.shape[-1] == 4: assert False else: raise ValueError( f'Last dim of reference_points must be' f' 2 or 4, but get {reference_points.shape[-1]} instead.') # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points # if torch.cuda.is_available() and value.is_cuda: if value.dtype == torch.float16: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 else: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 output = MultiScaleDeformableAttnFunction.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) if not self.batch_first: output = output.permute(1, 0, 2) return output @ATTENTION.register_module() class MSIPM3D(BaseModule): """An attention module used in BEVFormer based on Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. 
Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=8, im2col_step=64, dropout=0.1, batch_first=True, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.batch_first = batch_first self.output_proj = None self.fp16_enabled = False # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points # self.sampling_offsets = nn.Linear( # embed_dims, num_heads * num_levels * num_points * 2) # self.attention_weights = nn.Linear(embed_dims, # num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" # constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 # self.sampling_offsets.bias.data = grid_init.view(-1) self.fixed_sampling_offsets = nn.Parameter(grid_init.view(-1), requires_grad=False) # constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape ( bs, num_query, embed_dims). key (Tensor): The key tensor with shape `(bs, num_key, embed_dims)`. value (Tensor): The value tensor with shape `(bs, num_key, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. 
reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) sampling_offsets = self.fixed_sampling_offsets.view( 1, 1, self.num_heads, self.num_levels, self.num_points, 2).repeat( bs, num_query, 1, 1, 1,1) # attention_weights = self.attention_weights(query).view( # bs, num_query, self.num_heads, self.num_levels * self.num_points) attention_weights = query.new_ones((bs, num_query, self.num_heads, self.num_levels * self.num_points)) attention_weights = attention_weights.softmax(-1) # import pdb;pdb.set_trace() attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) if reference_points.shape[-1] == 2: """ For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. For each referent point, we sample `num_points` sampling points. For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. 
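            For instance (editor's illustrative numbers): if this layer is built with
            `num_points=8` and each BEV query has `num_Z_anchors=4`, the reshape below
            assigns 8 // 4 = 2 sampling points to each of the 4 projected reference
            points, i.e. 8 sampling points in total per head and level.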
""" offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) bs, num_query, num_Z_anchors, xy = reference_points.shape reference_points = reference_points[:, :, None, None, None, :, :] sampling_offsets = sampling_offsets / \ offset_normalizer[None, None, None, :, None, :] bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape sampling_offsets = sampling_offsets.view( bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) sampling_locations = reference_points + sampling_offsets bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape assert num_all_points == num_points * num_Z_anchors sampling_locations = sampling_locations.view( bs, num_query, num_heads, num_levels, num_all_points, xy) elif reference_points.shape[-1] == 4: assert False else: raise ValueError( f'Last dim of reference_points must be' f' 2 or 4, but get {reference_points.shape[-1]} instead.') # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points # if torch.cuda.is_available() and value.is_cuda: if value.dtype == torch.float16: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 else: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 output = MultiScaleDeformableAttnFunction.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) if not self.batch_first: output = output.permute(1, 0, 2) return output ================================================ FILE: plugin/models/backbones/bevformer/temporal_net.py ================================================ import torch import torch.nn as nn from typing import Optional, Sequence, Tuple, Union from mmdet.models import NECKS from mmcv.cnn.utils import kaiming_init, constant_init from mmcv.cnn.resnet import conv3x3 from torch import Tensor from einops import rearrange class MyResBlock(nn.Module): def __init__(self, inplanes: int, planes: int, stride: int = 1, dilation: int = 1, style: str = 'pytorch', with_cp: bool = False): super().__init__() assert style in ['pytorch', 'caffe'] self.conv1 = conv3x3(inplanes, planes, stride, dilation) self.bn1 = nn.BatchNorm2d(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = nn.BatchNorm2d(planes) self.stride = stride self.dilation = dilation assert not with_cp def forward(self, x: Tensor) -> Tensor: residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out += residual out = self.relu(out) return out @NECKS.register_module() class TemporalNet(nn.Module): def __init__(self, history_steps, hidden_dims, num_blocks): super(TemporalNet, self).__init__() self.history_steps = history_steps self.hidden_dims = hidden_dims self.num_blocks = num_blocks layers = [] in_dims = (history_steps+1) * hidden_dims self.conv_in = conv3x3(in_dims, hidden_dims, 1, 1) self.bn = nn.BatchNorm2d(hidden_dims) self.relu = nn.ReLU(inplace=True) for _ in range(self.num_blocks): layers.append(MyResBlock(hidden_dims, hidden_dims)) self.res_layer = nn.Sequential(*layers) def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) def forward(self, history_feats, curr_feat): input_feats = 
torch.cat([history_feats, curr_feat.unsqueeze(1)], dim=1) input_feats = rearrange(input_feats, 'b t c h w -> b (t c) h w') out = self.conv_in(input_feats) out = self.bn(out) out = self.relu(out) out = self.res_layer(out) if curr_feat.dim() == 3: out = out.squeeze(0) return out ================================================ FILE: plugin/models/backbones/bevformer/temporal_self_attention.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch import warnings import torch import torch.nn as nn from mmcv.cnn import xavier_init, constant_init from mmcv.cnn.bricks.registry import ATTENTION import math from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, to_2tuple) from mmcv.utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @ATTENTION.register_module() class TemporalSelfAttention(BaseModule): """An attention module used in BEVFormer based on Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to True. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV. the length of BEV queue is 2. 
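        Example (editor's illustrative sketch; ``num_levels=1`` reflects that the BEV
        plane is a single feature level, the other values are common choices rather than
        defaults fixed by this repository)::

            >>> from mmcv.cnn.bricks.transformer import build_attention
            >>> attn = build_attention(dict(
            ...     type='TemporalSelfAttention',
            ...     embed_dims=256, num_heads=8, num_levels=1,
            ...     num_points=4, num_bev_queue=2))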
""" def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, num_bev_queue=2, im2col_step=64, dropout=0.1, batch_first=True, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first self.fp16_enabled = False # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.num_bev_queue = num_bev_queue self.sampling_offsets = nn.Linear( embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, flag='decoder', **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. 
With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: assert self.batch_first bs, len_bev, c = query.shape value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c) # value = torch.cat([query, query], 0) if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, embed_dims = query.shape _, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value assert self.num_bev_queue == 2 query = torch.cat([value[:bs], query], -1) value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.reshape(bs*self.num_bev_queue, num_value, self.num_heads, -1) sampling_offsets = self.sampling_offsets(query) sampling_offsets = sampling_offsets.view( bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points) attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points) attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\ .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous() sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\ .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2) if reference_points.shape[-1] == 2: offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) sampling_locations = reference_points[:, :, None, :, None, :] \ + sampling_offsets \ / offset_normalizer[None, None, None, :, None, :] elif reference_points.shape[-1] == 4: sampling_locations = reference_points[:, :, None, :, None, :2] \ + sampling_offsets / self.num_points \ * reference_points[:, :, None, :, None, 2:] \ * 0.5 else: raise ValueError( f'Last dim of reference_points must be' f' 2 or 4, but get {reference_points.shape[-1]} instead.') if torch.cuda.is_available() and value.is_cuda: # using fp16 deformable attention is unstable because it performs many sum operations if value.dtype == torch.float16: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 else: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 output = MultiScaleDeformableAttnFunction.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) # output shape (bs*num_bev_queue, num_query, embed_dims) # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) output = output.permute(1, 2, 0) # fuse history value and current value # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue) output = output.view(num_query, embed_dims, bs, self.num_bev_queue) output = output.mean(-1) # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims) 
output = output.permute(2, 0, 1) output = self.output_proj(output) if not self.batch_first: output = output.permute(1, 0, 2) return self.dropout(output) + identity ================================================ FILE: plugin/models/backbones/bevformer/transformer.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import numpy as np import torch import torch.nn as nn from mmcv.cnn import xavier_init from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence from mmcv.runner.base_module import BaseModule from mmdet.models.utils.builder import TRANSFORMER from torch.nn.init import normal_ from mmcv.runner.base_module import BaseModule from torchvision.transforms.functional import rotate from .temporal_self_attention import TemporalSelfAttention from .spatial_cross_attention import MSDeformableAttention3D from mmcv.runner import force_fp32, auto_fp16 from einops import rearrange @TRANSFORMER.register_module() class PerceptionTransformer(BaseModule): """Implements the Detr3D transformer. Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ def __init__(self, num_feature_levels=4, num_cams=6, encoder=None, embed_dims=256, use_cams_embeds=True, **kwargs): super().__init__(**kwargs) self.encoder = build_transformer_layer_sequence(encoder) # self.decoder = build_transformer_layer_sequence(decoder) self.embed_dims = embed_dims self.num_feature_levels = num_feature_levels self.num_cams = num_cams self.fp16_enabled = False self.use_cams_embeds = use_cams_embeds self.init_layers() def init_layers(self): """Initialize layers of the Detr3DTransformer.""" self.level_embeds = nn.Parameter(torch.Tensor( self.num_feature_levels, self.embed_dims)) self.cams_embeds = nn.Parameter( torch.Tensor(self.num_cams, self.embed_dims)) # self.reference_points = nn.Linear(self.embed_dims, 3) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention): try: m.init_weight() except AttributeError: m.init_weights() normal_(self.level_embeds) normal_(self.cams_embeds) # xavier_init(self.reference_points, distribution='uniform', bias=0.) # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) def get_bev_features( self, mlvl_feats, bev_queries, bev_h, bev_w, bev_pos=None, prop_bev=None, prev_bev=None, warped_history_bev=None, **kwargs): """ obtain bev features. 
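        Args (editor's summary, inferred from the function body and its caller in
        bevformer_backbone.py):
            mlvl_feats (list[Tensor]): Multi-level image features, each of shape
                (bs, num_cams, C, H, W).
            bev_queries (Tensor): Learnable BEV query embeddings of shape
                (bev_h * bev_w, embed_dims).
            bev_h, bev_w (int): Spatial size of the BEV grid.
            bev_pos (Tensor): BEV positional encoding of shape
                (bs, embed_dims, bev_h, bev_w).
            prop_bev (Tensor, optional): BEV features propagated from the previous frame,
                shape (bs, embed_dims, bev_h, bev_w); cells with a positive feature sum
                overwrite the corresponding BEV queries before encoding.
            prev_bev / warped_history_bev (Tensor, optional): Previous and ego-motion
                warped history BEV features, forwarded to the encoder.

        Returns:
            Tensor: BEV embedding of shape (bs, bev_h * bev_w, embed_dims).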
""" bs = mlvl_feats[0].size(0) bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) bev_pos = bev_pos.flatten(2).permute(2, 0, 1) shift = bev_queries.new_tensor((0,0))[None].repeat(bs,1) feat_flatten = [] spatial_shapes = [] for lvl, feat in enumerate(mlvl_feats): bs, num_cam, c, h, w = feat.shape spatial_shape = (h, w) feat = feat.flatten(3).permute(1, 0, 3, 2) if self.use_cams_embeds: feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) feat = feat + self.level_embeds[None, None, lvl:lvl + 1, :].to(feat.dtype) spatial_shapes.append(spatial_shape) feat_flatten.append(feat) feat_flatten = torch.cat(feat_flatten, 2) spatial_shapes = torch.as_tensor( spatial_shapes, dtype=torch.long, device=bev_pos.device) level_start_index = torch.cat((spatial_shapes.new_zeros( (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) feat_flatten = feat_flatten.permute( 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) # Fuse the propagated bev features from the prev step if prop_bev is not None: prop_bev = rearrange(prop_bev, 'b c h w -> (h w) b c') valid_mask = (prop_bev.sum(-1) > 0).to(bev_queries.dtype)[..., None] bev_queries = bev_queries * (1 - valid_mask) + prop_bev * valid_mask bev_embed = self.encoder( bev_queries, feat_flatten, feat_flatten, bev_h=bev_h, bev_w=bev_w, bev_pos=bev_pos, spatial_shapes=spatial_shapes, level_start_index=level_start_index, prev_bev=prev_bev, shift=shift, warped_history_bev=warped_history_bev, **kwargs ) return bev_embed @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) def forward(self, mlvl_feats, bev_queries, object_query_embed, bev_h, bev_w, grid_length=[0.512, 0.512], bev_pos=None, reg_branches=None, cls_branches=None, prev_bev=None, **kwargs): """Forward function for `Detr3DTransformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, num_cams, embed_dims, h, w]. bev_queries (Tensor): (bev_h*bev_w, c) bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) object_query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - bev_embed: BEV features - inter_states: Outputs from decoder. If return_intermediate_dec is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of \ proposals generated from \ encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. - enc_outputs_coord_unact: The regression results \ generated from encoder's feature maps., has shape \ (batch, h*w, 4). Only would \ be returned when `as_two_stage` is True, \ otherwise None. 
""" raise NotImplementedError ================================================ FILE: plugin/models/backbones/bevformer_backbone.py ================================================ import copy import torch import torch.nn as nn import torch.nn.functional as F from mmdet.models import BACKBONES from mmcv.runner import force_fp32, auto_fp16 from mmdet.models.utils import build_transformer from mmcv.cnn.bricks.transformer import FFN, build_positional_encoding from .bevformer.grid_mask import GridMask from mmdet3d.models import builder from contextlib import nullcontext class UpsampleBlock(nn.Module): def __init__(self, ins, outs): super(UpsampleBlock, self).__init__() self.gn = nn.GroupNorm(32, outs) self.conv = nn.Conv2d(ins, outs, kernel_size=3, stride=1, padding=1) # same self.relu = nn.ReLU(inplace=True) def init_weights(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def forward(self, x): x = self.conv(x) x = self.relu(self.gn(x)) x = self.upsample2x(x) return x def upsample2x(self, x): _, _, h, w = x.shape x = F.interpolate(x, size=(h*2, w*2), mode='bilinear', align_corners=True) return x @BACKBONES.register_module() class BEVFormerBackbone(nn.Module): """Head of Detr3D. Args: with_box_refine (bool): Whether to refine the reference points in the decoder. Defaults to False. as_two_stage (bool) : Whether to generate the proposal from the outputs of encoder. transformer (obj:`ConfigDict`): ConfigDict is used for building the Encoder and Decoder. bev_h, bev_w (int): spatial shape of BEV queries. """ def __init__(self, roi_size, bev_h, bev_w, img_backbone=None, img_neck=None, transformer=None, positional_encoding=None, use_grid_mask=True, upsample=False, up_outdim=128, history_steps=None, **kwargs): super(BEVFormerBackbone, self).__init__() # image feature self.default_ratio = 0.5 self.default_prob = 0.7 self.grid_mask = GridMask( True, True, rotate=1, offset=False, ratio=self.default_ratio, mode=1, prob=self.default_prob) self.use_grid_mask = use_grid_mask if img_backbone: self.img_backbone = builder.build_backbone(img_backbone) if img_neck is not None: self.img_neck = builder.build_neck(img_neck) self.with_img_neck = True else: self.with_img_neck = False self.bev_h = bev_h self.bev_w = bev_w self.real_w = roi_size[0] self.real_h = roi_size[1] self.positional_encoding = build_positional_encoding( positional_encoding) self.transformer = build_transformer(transformer) self.embed_dims = self.transformer.embed_dims self.upsample = upsample if self.upsample: self.up = UpsampleBlock(self.transformer.embed_dims, up_outdim) self.history_steps = history_steps self._init_layers() self.init_weights() def _init_layers(self): """Initialize classification branch and regression branch of head.""" self.bev_embedding = nn.Embedding( self.bev_h * self.bev_w, self.embed_dims) def init_weights(self): """Initialize weights of the DeformDETR head.""" self.transformer.init_weights() self.img_backbone.init_weights() self.img_neck.init_weights() if self.upsample: self.up.init_weights() # @auto_fp16(apply_to=('img')) def extract_img_feat(self, img, img_metas, len_queue=None): """Extract features of images.""" B = img.size(0) if img is not None: # input_shape = img.shape[-2:] # # update real input shape of each single img # for img_meta in img_metas: # img_meta.update(input_shape=input_shape) if img.dim() == 5 and img.size(0) == 1: img = img.squeeze(0) elif img.dim() == 5 and img.size(0) > 1: B, N, C, H, W = img.size() img = img.reshape(B * N, C, H, W) if self.use_grid_mask: img = 
self.grid_mask(img) img_feats = self.img_backbone(img) if isinstance(img_feats, dict): img_feats = list(img_feats.values()) else: return None if self.with_img_neck: img_feats = self.img_neck(img_feats) img_feats_reshaped = [] for img_feat in img_feats: BN, C, H, W = img_feat.size() if len_queue is not None: img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) else: img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) return img_feats_reshaped def forward(self, img, img_metas, timestep, history_bev_feats, history_img_metas, all_history_coord, *args, prev_bev=None, img_backbone_gradient=True, **kwargs): """Forward function. Args: mlvl_feats (tuple[Tensor]): Features from the upstream network, each is a 5D-tensor with shape (B, N, C, H, W). prev_bev: previous bev featues Returns: all_cls_scores (Tensor): Outputs from the classification head, \ shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background. all_bbox_preds (Tensor): Sigmoid outputs from the regression \ head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ Shape [nb_dec, bs, num_query, 9]. """ # Optionally turn off the gradient backprop for the 2D image backbones # but always keep the gradients on for the BEV transformer part backprop_context = torch.no_grad if img_backbone_gradient is False else nullcontext with backprop_context(): mlvl_feats = self.extract_img_feat(img=img, img_metas=img_metas) bs, num_cam, _, _, _ = mlvl_feats[0].shape dtype = mlvl_feats[0].dtype bev_queries = self.bev_embedding.weight.to(dtype) # Prepare the transformed history bev features, add the bev prop fusion here if len(history_bev_feats) > 0: all_warped_history_feat = [] for b_i in range(bs): history_coord = all_history_coord[b_i] history_bev_feats_i = torch.stack([feats[b_i] for feats in history_bev_feats], 0) warped_history_feat_i = F.grid_sample(history_bev_feats_i, history_coord, padding_mode='zeros', align_corners=False) all_warped_history_feat.append(warped_history_feat_i) all_warped_history_feat = torch.stack(all_warped_history_feat, dim=0) # BTCHW prop_bev_feat = all_warped_history_feat[:, -1] else: all_warped_history_feat = None prop_bev_feat = None # pad the bev history buffer to fixed length if len(history_bev_feats) < self.history_steps: num_repeat = self.history_steps - len(history_bev_feats) zero_bev_feats = torch.zeros([bs, bev_queries.shape[1], self.bev_h, self.bev_w]).to(bev_queries.device) padding_history_bev_feats = torch.stack([zero_bev_feats,] * num_repeat, dim=1) if all_warped_history_feat is not None: all_warped_history_feat = torch.cat([padding_history_bev_feats, all_warped_history_feat], dim=1) else: all_warped_history_feat = padding_history_bev_feats bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), device=bev_queries.device).to(dtype) bev_pos = self.positional_encoding(bev_mask).to(dtype) outs = self.transformer.get_bev_features( mlvl_feats, bev_queries, self.bev_h, self.bev_w, grid_length=(self.real_h / self.bev_h, self.real_w / self.bev_w), bev_pos=bev_pos, prop_bev=prop_bev_feat, img_metas=img_metas, prev_bev=prev_bev, warped_history_bev=all_warped_history_feat, ) outs = outs.unflatten(1,(self.bev_h,self.bev_w)).permute(0,3,1,2).contiguous() if self.upsample: outs = self.up(outs) return outs, mlvl_feats ================================================ FILE: plugin/models/heads/MapDetectorHead.py ================================================ import copy import torch import torch.nn as nn import 
torch.nn.functional as F from mmcv.cnn import Conv2d, Linear, build_activation_layer, bias_init_with_prob, xavier_init from mmcv.runner import force_fp32 from mmcv.cnn.bricks.transformer import build_positional_encoding from mmdet.models.utils import build_transformer from mmdet.models import build_loss from mmdet.core import multi_apply, reduce_mean, build_assigner, build_sampler from mmdet.models import HEADS from mmdet.models.utils.transformer import inverse_sigmoid from einops import rearrange @HEADS.register_module(force=True) class MapDetectorHead(nn.Module): def __init__(self, num_queries, num_classes=3, in_channels=128, embed_dims=256, score_thr=0.1, num_points=20, coord_dim=2, roi_size=(60, 30), different_heads=True, predict_refine=False, bev_pos=None, sync_cls_avg_factor=True, bg_cls_weight=0., trans_loss_weight=0.0, transformer=dict(), loss_cls=dict(), loss_reg=dict(), assigner=dict() ): super().__init__() self.num_queries = num_queries self.num_classes = num_classes self.in_channels = in_channels self.embed_dims = embed_dims self.different_heads = different_heads self.predict_refine = predict_refine self.bev_pos = bev_pos self.num_points = num_points self.coord_dim = coord_dim self.sync_cls_avg_factor = sync_cls_avg_factor self.bg_cls_weight = bg_cls_weight self.trans_loss_weight = trans_loss_weight # NOTE: below is a simple MLP to transform the query from prev-frame to cur-frame, # we moved the propagation part outside, self.register_buffer('roi_size', torch.tensor(roi_size, dtype=torch.float32)) origin = (-roi_size[0]/2, -roi_size[1]/2) self.register_buffer('origin', torch.tensor(origin, dtype=torch.float32)) sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.transformer = build_transformer(transformer) self.loss_cls = build_loss(loss_cls) self.loss_reg = build_loss(loss_reg) self.assigner = build_assigner(assigner) if self.loss_cls.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 self._init_embedding() self._init_branch() self.init_weights() def init_weights(self): """Initialize weights of the DeformDETR head.""" for p in self.input_proj.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) xavier_init(self.reference_points_embed, distribution='uniform', bias=0.) 
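        # NOTE (editor): `reference_points_embed` (defined in `_init_embedding`) maps each
        # query embedding to num_points * 2 logits; forward_train/forward_test apply a
        # sigmoid to them to obtain the initial normalized polyline points that the
        # decoder layers then refine.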
self.transformer.init_weights() # init prediction branch for m in self.reg_branches: for param in m.parameters(): if param.dim() > 1: nn.init.xavier_uniform_(param) # focal loss init if self.loss_cls.use_sigmoid: bias_init = bias_init_with_prob(0.01) if isinstance(self.cls_branches, nn.ModuleList): for m in self.cls_branches: if hasattr(m, 'bias'): nn.init.constant_(m.bias, bias_init) else: m = self.cls_branches nn.init.constant_(m.bias, bias_init) if hasattr(self, 'query_alpha'): for m in self.query_alpha: for param in m.parameters(): if param.dim() > 1: nn.init.zeros_(param) def _init_embedding(self): positional_encoding = dict( type='SinePositionalEncoding', num_feats=self.embed_dims//2, normalize=True ) self.bev_pos_embed = build_positional_encoding(positional_encoding) # query_pos_embed & query_embed self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) self.reference_points_embed = nn.Linear(self.embed_dims, self.num_points * 2) def _init_branch(self,): """Initialize classification branch and regression branch of head.""" self.input_proj = Conv2d( self.in_channels, self.embed_dims, kernel_size=1) cls_branch = Linear(self.embed_dims, self.cls_out_channels) reg_branch = [ Linear(self.embed_dims, 2*self.embed_dims), nn.LayerNorm(2*self.embed_dims), nn.ReLU(), Linear(2*self.embed_dims, 2*self.embed_dims), nn.LayerNorm(2*self.embed_dims), nn.ReLU(), Linear(2*self.embed_dims, self.num_points * self.coord_dim), ] reg_branch = nn.Sequential(*reg_branch) num_layers = self.transformer.decoder.num_layers if self.different_heads: cls_branches = nn.ModuleList( [copy.deepcopy(cls_branch) for _ in range(num_layers)]) reg_branches = nn.ModuleList( [copy.deepcopy(reg_branch) for _ in range(num_layers)]) else: cls_branches = nn.ModuleList( [cls_branch for _ in range(num_layers)]) reg_branches = nn.ModuleList( [reg_branch for _ in range(num_layers)]) self.reg_branches = reg_branches self.cls_branches = cls_branches def _prepare_context(self, bev_features): """Prepare class label and vertex context.""" device = bev_features.device # Add 2D coordinate grid embedding B, C, H, W = bev_features.shape bev_mask = bev_features.new_zeros(B, H, W) bev_pos_embeddings = self.bev_pos_embed(bev_mask) # (bs, embed_dims, H, W) bev_features = self.input_proj(bev_features) + bev_pos_embeddings # (bs, embed_dims, H, W) assert list(bev_features.shape) == [B, self.embed_dims, H, W] return bev_features def forward_train(self, bev_features, img_metas, gts, track_query_info=None, memory_bank=None, return_matching=False): ''' Args: bev_feature (List[Tensor]): shape [B, C, H, W] feature in bev view Outs: preds_dict (list[dict]): lines (Tensor): Classification score of all decoder layers, has shape [bs, num_query, 2*num_points] scores (Tensor): [bs, num_query,] ''' bev_features = self._prepare_context(bev_features) bs, C, H, W = bev_features.shape img_masks = bev_features.new_zeros((bs, H, W)) # pos_embed = self.positional_encoding(img_masks) pos_embed = None query_embedding = self.query_embedding.weight[None, ...].repeat(bs, 1, 1) # [B, num_q, embed_dims] input_query_num = self.num_queries init_reference_points = self.reference_points_embed(query_embedding).sigmoid() # (bs, num_q, 2*num_pts) init_reference_points = init_reference_points.view(-1, self.num_queries, self.num_points, 2) # (bs, num_q, num_pts, 2) assert list(init_reference_points.shape) == [bs, self.num_queries, self.num_points, 2] assert list(query_embedding.shape) == [bs, self.num_queries, self.embed_dims] # Prepare the propagated track queries, 
concat with the original dummy queries if track_query_info is not None and 'track_query_hs_embeds' in track_query_info[0]: new_query_embeds = [] new_init_ref_pts = [] for b_i in range(bs): new_queries = torch.cat([track_query_info[b_i]['track_query_hs_embeds'], query_embedding[b_i], track_query_info[b_i]['pad_hs_embeds']], dim=0) new_query_embeds.append(new_queries) init_ref = rearrange(init_reference_points[b_i], 'n k c -> n (k c)', c=2) new_ref = torch.cat([track_query_info[b_i]['trans_track_query_boxes'], init_ref, track_query_info[b_i]['pad_query_boxes']], dim=0) new_ref = rearrange(new_ref, 'n (k c) -> n k c', c=2) new_init_ref_pts.append(new_ref) #print('length of track queries', track_query_info[b_i]['track_query_hs_embeds'].shape[0]) # concat to get the track+dummy queries query_embedding = torch.stack(new_query_embeds, dim=0) init_reference_points = torch.stack(new_init_ref_pts, dim=0) query_kp_mask = torch.stack([t['query_padding_mask'] for t in track_query_info], dim=0) else: query_kp_mask = query_embedding.new_zeros((bs, self.num_queries), dtype=torch.bool) # outs_dec: (num_layers, num_qs, bs, embed_dims) inter_queries, init_reference, inter_references = self.transformer( mlvl_feats=[bev_features,], mlvl_masks=[img_masks.type(torch.bool)], query_embed=query_embedding, mlvl_pos_embeds=[pos_embed], # not used memory_query=None, init_reference_points=init_reference_points, reg_branches=self.reg_branches, cls_branches=self.cls_branches, predict_refine=self.predict_refine, query_key_padding_mask=query_kp_mask, # mask used in self-attn, memory_bank=memory_bank, ) outputs = [] for i, (queries) in enumerate(inter_queries): reg_points = inter_references[i] # (bs, num_q, num_points, 2) bs = reg_points.shape[0] reg_points = reg_points.view(bs, -1, 2*self.num_points) # (bs, num_q, 2*num_points) scores = self.cls_branches[i](queries) # (bs, num_q, num_classes) reg_points_list = [] scores_list = [] for i in range(len(scores)): # padding queries should not be output reg_points_list.append(reg_points[i]) scores_list.append(scores[i]) pred_dict = { 'lines': reg_points_list, 'scores': scores_list } if return_matching: pred_dict['hs_embeds'] = queries outputs.append(pred_dict) # Pass in the track query information to massage the cost matrix loss_dict, det_match_idxs, det_match_gt_idxs, gt_info_list, matched_reg_cost = \ self.loss(gts=gts, preds=outputs, track_info=track_query_info) if return_matching: return loss_dict, outputs[-1], det_match_idxs[-1], det_match_gt_idxs[-1], matched_reg_cost[-1], gt_info_list[-1] else: return outputs, loss_dict, det_match_idxs, det_match_gt_idxs, gt_info_list def forward_test(self, bev_features, img_metas, track_query_info=None, memory_bank=None): ''' Args: bev_feature (List[Tensor]): shape [B, C, H, W] feature in bev view Outs: preds_dict (list[dict]): lines (Tensor): Classification score of all decoder layers, has shape [bs, num_query, 2*num_points] scores (Tensor): [bs, num_query,] ''' bev_features = self._prepare_context(bev_features) bs, C, H, W = bev_features.shape assert bs == 1, 'Only support bs=1 per-gpu for inference' img_masks = bev_features.new_zeros((bs, H, W)) # pos_embed = self.positional_encoding(img_masks) pos_embed = None query_embedding = self.query_embedding.weight[None, ...].repeat(bs, 1, 1) # [B, num_q, embed_dims] input_query_num = self.num_queries # num query: self.num_query + self.topk init_reference_points = self.reference_points_embed(query_embedding).sigmoid() # (bs, num_q, 2*num_pts) init_reference_points = 
init_reference_points.view(-1, self.num_queries, self.num_points, 2) # (bs, num_q, num_pts, 2) assert list(init_reference_points.shape) == [bs, input_query_num, self.num_points, 2] assert list(query_embedding.shape) == [bs, input_query_num, self.embed_dims] # Prepare the propagated track queries, concat with the original dummy queries if track_query_info is not None and 'track_query_hs_embeds' in track_query_info[0]: prev_hs_embed = torch.stack([t['track_query_hs_embeds'] for t in track_query_info]) prev_boxes = torch.stack([t['trans_track_query_boxes'] for t in track_query_info]) prev_boxes = rearrange(prev_boxes, 'b n (k c) -> b n k c', c=2) # concat to get the track+dummy queries query_embedding = torch.cat([prev_hs_embed, query_embedding], dim=1) init_reference_points = torch.cat([prev_boxes, init_reference_points], dim=1) query_kp_mask = query_embedding.new_zeros((bs, query_embedding.shape[1]), dtype=torch.bool) # outs_dec: (num_layers, num_qs, bs, embed_dims) inter_queries, init_reference, inter_references = self.transformer( mlvl_feats=[bev_features,], mlvl_masks=[img_masks.type(torch.bool)], query_embed=query_embedding, mlvl_pos_embeds=[pos_embed], # not used memory_query=None, init_reference_points=init_reference_points, reg_branches=self.reg_branches, cls_branches=self.cls_branches, predict_refine=self.predict_refine, query_key_padding_mask=query_kp_mask, # mask used in self-attn, memory_bank=memory_bank, ) outputs = [] for i_query, (queries) in enumerate(inter_queries): reg_points = inter_references[i_query] # (bs, num_q, num_points, 2) bs = reg_points.shape[0] reg_points = reg_points.view(bs, -1, 2*self.num_points) # (bs, num_q, 2*num_points) scores = self.cls_branches[i_query](queries) # (bs, num_q, num_classes) reg_points_list = [] scores_list = [] for i in range(len(scores)): # padding queries should not be output reg_points_list.append(reg_points[i]) scores_list.append(scores[i]) pred_dict = { 'lines': reg_points_list, 'scores': scores_list, 'hs_embeds': queries, } outputs.append(pred_dict) return outputs @force_fp32(apply_to=('score_pred', 'lines_pred', 'gt_lines')) def _get_target_single(self, score_pred, lines_pred, gt_labels, gt_lines, track_info=None, gt_bboxes_ignore=None): """ Compute regression and classification targets for one image. Outputs from a single decoder layer of a single feature level are used. Args: score_pred (Tensor): Box score logits from a single decoder layer for one image. Shape [num_query, cls_out_channels]. lines_pred (Tensor): shape [num_query, 2*num_points] gt_labels (torch.LongTensor) shape [num_gt, ] gt_lines (Tensor): shape [num_gt, 2*num_points]. Returns: tuple[Tensor]: a tuple containing the following for one sample. - labels (LongTensor): Labels of each image. shape [num_query, 1] - label_weights (Tensor]): Label weights of each image. shape [num_query, 1] - lines_target (Tensor): Lines targets of each image. shape [num_query, num_points, 2] - lines_weights (Tensor): Lines weights of each image. shape [num_query, num_points, 2] - pos_inds (Tensor): Sampled positive indices for each image. - neg_inds (Tensor): Sampled negative indices for each image. 
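            Note (editor): besides the entries above, the returned tuple also contains
            `pos_gt_inds` and `matched_reg_cost`; in addition, the assigner may return a
            per-(query, gt) permutation index, since a ground-truth polyline can match
            under several equivalent point orderings, and it is used below to pick the
            matched ordering when filling `lines_target`.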
""" num_pred_lines = len(lines_pred) # assigner and sampler # We massage the matching cost here using the track info, following # the 3-type supervision of TrackFormer/MOTR assign_result, gt_permute_idx, matched_reg_cost = self.assigner.assign(preds=dict(lines=lines_pred, scores=score_pred,), gts=dict(lines=gt_lines, labels=gt_labels, ), track_info=track_info, gt_bboxes_ignore=gt_bboxes_ignore) sampling_result = self.sampler.sample( assign_result, lines_pred, gt_lines) num_gt = len(gt_lines) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds pos_gt_inds = sampling_result.pos_assigned_gt_inds labels = gt_lines.new_full( (num_pred_lines, ), self.num_classes, dtype=torch.long) # (num_q, ) labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] label_weights = gt_lines.new_ones(num_pred_lines) # (num_q, ) lines_target = torch.zeros_like(lines_pred) # (num_q, 2*num_pts) lines_weights = torch.zeros_like(lines_pred) # (num_q, 2*num_pts) if num_gt > 0: if gt_permute_idx is not None: # using permute invariant label # gt_permute_idx: (num_q, num_gt) # pos_inds: which query is positive # pos_gt_inds: which gt each pos pred is assigned # single_matched_gt_permute_idx: which permute order is matched single_matched_gt_permute_idx = gt_permute_idx[ pos_inds, pos_gt_inds ] lines_target[pos_inds] = gt_lines[pos_gt_inds, single_matched_gt_permute_idx].type( lines_target.dtype) # (num_q, 2*num_pts) else: lines_target[pos_inds] = sampling_result.pos_gt_bboxes.type( lines_target.dtype) # (num_q, 2*num_pts) lines_weights[pos_inds] = 1.0 # (num_q, 2*num_pts) # normalization # n = lines_weights.sum(-1, keepdim=True) # (num_q, 1) # lines_weights = lines_weights / n.masked_fill(n == 0, 1) # (num_q, 2*num_pts) # [0, ..., 0] for neg ind and [1/npts, ..., 1/npts] for pos ind return (labels, label_weights, lines_target, lines_weights, pos_inds, neg_inds, pos_gt_inds, matched_reg_cost) # @force_fp32(apply_to=('preds', 'gts')) def get_targets(self, preds, gts, track_info=None, gt_bboxes_ignore_list=None): """ Compute regression and classification targets for a batch image. Outputs from a single decoder layer of a single feature level are used. Args: preds (dict): - lines (Tensor): shape (bs, num_queries, 2*num_points) - scores (Tensor): shape (bs, num_queries, num_class_channels) gts (dict): - class_label (list[Tensor]): tensor shape (num_gts, ) - lines (list[Tensor]): tensor shape (num_gts, 2*num_points) gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: tuple: a tuple containing the following targets. - labels_list (list[Tensor]): Labels for all images. - label_weights_list (list[Tensor]): Label weights for all \ images. - lines_targets_list (list[Tensor]): Lines targets for all \ images. - lines_weight_list (list[Tensor]): Lines weights for all \ images. - num_total_pos (int): Number of positive samples in all \ images. - num_total_neg (int): Number of negative samples in all \ images. """ assert gt_bboxes_ignore_list is None, \ 'Only supports for gt_bboxes_ignore setting to None.' 
# format the inputs gt_labels = gts['labels'] gt_lines = gts['lines'] lines_pred = preds['lines'] if track_info is None: track_info = [track_info for _ in range(len(gt_labels))] (labels_list, label_weights_list, lines_targets_list, lines_weights_list, pos_inds_list, neg_inds_list,pos_gt_inds_list, matched_reg_cost) = multi_apply( self._get_target_single, preds['scores'], lines_pred, gt_labels, gt_lines, track_info, gt_bboxes_ignore=gt_bboxes_ignore_list) num_total_pos = sum((inds.numel() for inds in pos_inds_list)) num_total_neg = sum((inds.numel() for inds in neg_inds_list)) if track_info[0] is not None: # remove the padding elements from the neg counting padding_mask = torch.cat([t['query_padding_mask'] for t in track_info], dim=0) num_padding = padding_mask.sum() num_total_neg -= num_padding new_gts = dict( labels=labels_list, # list[Tensor(num_q, )], length=bs label_weights=label_weights_list, # list[Tensor(num_q, )], length=bs, all ones lines=lines_targets_list, # list[Tensor(num_q, 2*num_pts)], length=bs lines_weights=lines_weights_list, # list[Tensor(num_q, 2*num_pts)], length=bs ) return new_gts, num_total_pos, num_total_neg, pos_inds_list, pos_gt_inds_list, matched_reg_cost # @force_fp32(apply_to=('preds', 'gts')) def loss_single(self, preds, gts, track_info=None, gt_bboxes_ignore_list=None, reduction='none'): """ Loss function for outputs from a single decoder layer of a single feature level. Args: preds (dict): - lines (Tensor): shape (bs, num_queries, 2*num_points) - scores (Tensor): shape (bs, num_queries, num_class_channels) gts (dict): - class_label (list[Tensor]): tensor shape (num_gts, ) - lines (list[Tensor]): tensor shape (num_gts, 2*num_points) gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. """ # Get target for each sample new_gts, num_total_pos, num_total_neg, pos_inds_list, pos_gt_inds_list, matched_reg_cost =\ self.get_targets(preds, gts, track_info, gt_bboxes_ignore_list) # Batched all data # for k, v in new_gts.items(): # new_gts[k] = torch.stack(v, dim=0) # tensor (bs, num_q, ...) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( preds['scores'][0].new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) if track_info is not None: cat_padding_mask = torch.cat([t['query_padding_mask'] for t in track_info], dim=0) padding_loss_mask = ~cat_padding_mask # Classification loss # since the inputs needs the second dim is the class dim, we permute the prediction. 
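        # NOTE: when track queries are present, padding_loss_mask (built above from the
        # per-sample query_padding_mask) is False at padded slots; it is multiplied into
        # both the classification weights and the line regression weights below, so the
        # padding queries added by _batchify_tracks contribute nothing to either loss.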
pred_scores = torch.cat(preds['scores'], dim=0) # (bs*num_q, cls_out_channles) cls_scores = pred_scores.reshape(-1, self.cls_out_channels) # (bs*num_q, cls_out_channels) cls_labels = torch.cat(new_gts['labels'], dim=0).reshape(-1) # (bs*num_q, ) cls_weights = torch.cat(new_gts['label_weights'], dim=0).reshape(-1) # (bs*num_q, ) if track_info is not None: cls_weights = cls_weights * padding_loss_mask.float() loss_cls = self.loss_cls( cls_scores, cls_labels, cls_weights, avg_factor=cls_avg_factor) # Compute the average number of gt boxes across all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() pred_lines = torch.cat(preds['lines'], dim=0) gt_lines = torch.cat(new_gts['lines'], dim=0) line_weights = torch.cat(new_gts['lines_weights'], dim=0) if track_info is not None: line_weights = line_weights * padding_loss_mask[:, None].float() assert len(pred_lines) == len(gt_lines) assert len(gt_lines) == len(line_weights) loss_reg = self.loss_reg( pred_lines, gt_lines, line_weights, avg_factor=num_total_pos) loss_dict = dict( cls=loss_cls, reg=loss_reg, ) new_gts_info = { 'labels': new_gts['labels'], 'lines': new_gts['lines'], } return loss_dict, pos_inds_list, pos_gt_inds_list, matched_reg_cost, new_gts_info @force_fp32(apply_to=('gt_lines_list', 'preds_dicts')) def loss(self, gts, preds, gt_bboxes_ignore=None, track_info=None, reduction='mean', ): """ Loss Function. Args: gts (list[dict]): list length: num_layers dict { 'label': list[tensor(num_gts, )], list length: batchsize, 'line': list[tensor(num_gts, 2*num_points)], list length: batchsize, ... } preds (list[dict]): list length: num_layers dict { 'lines': tensor(bs, num_queries, 2*num_points), 'scores': tensor(bs, num_queries, class_out_channels), } gt_bboxes_ignore (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert gt_bboxes_ignore is None, \ f'{self.__class__.__name__} only supports ' \ f'for gt_bboxes_ignore setting to None.' 
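        # NOTE: gts (and hence track_info below) carries one copy per decoder layer, so
        # multi_apply runs loss_single once per layer. The per-layer dicts are then
        # flattened into a single loss_dict: the last decoder layer keeps the plain keys
        # and earlier layers get an index prefix, roughly
        #
        #     {'cls': ..., 'reg': ...,          # last decoder layer
        #      'd0.cls': ..., 'd0.reg': ...,    # first decoder layer
        #      'd1.cls': ..., 'd1.reg': ...}    # and so on
        #
        # (key names taken from loss_single above; any extra keys follow the same rule).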
track_info = [track_info for _ in range(len(gts))] # Since there might have multi layer losses, pos_inds_lists, pos_gt_inds_lists, matched_reg_costs, gt_info_list = multi_apply( self.loss_single, preds, gts, track_info, reduction=reduction) # Format the losses loss_dict = dict() # loss from the last decoder layer for k, v in losses[-1].items(): loss_dict[k] = v # Loss from other decoder layers num_dec_layer = 0 for loss in losses[:-1]: for k, v in loss.items(): loss_dict[f'd{num_dec_layer}.{k}'] = v num_dec_layer += 1 return loss_dict, pos_inds_lists, pos_gt_inds_lists, gt_info_list, matched_reg_costs def post_process(self, preds_dict, tokens, track_dict=None, thr=0.0): lines = preds_dict['lines'] # List[Tensor(num_queries, 2*num_points)] bs = len(lines) scores = preds_dict['scores'] # (bs, num_queries, 3) results = [] for i in range(bs): tmp_vectors = lines[i] # set up the prop_flags tmp_prop_flags = torch.zeros(tmp_vectors.shape[0]).bool() tmp_prop_flags[-100:] = 0 tmp_prop_flags[:-100] = 1 num_preds, num_points2 = tmp_vectors.shape tmp_vectors = tmp_vectors.view(num_preds, num_points2//2, 2) if self.loss_cls.use_sigmoid: tmp_scores, tmp_labels = scores[i].max(-1) tmp_scores = tmp_scores.sigmoid() pos = tmp_scores > thr else: assert self.num_classes + 1 == self.cls_out_channels tmp_scores, tmp_labels = scores[i].max(-1) bg_cls = self.cls_out_channels pos = tmp_labels != bg_cls tmp_vectors = tmp_vectors[pos] tmp_scores = tmp_scores[pos] tmp_labels = tmp_labels[pos] tmp_prop_flags = tmp_prop_flags[pos] if len(tmp_scores) == 0: single_result = { 'vectors': [], 'scores': [], 'labels': [], 'props': [], 'token': tokens[i] } else: single_result = { 'vectors': tmp_vectors.detach().cpu().numpy(), 'scores': tmp_scores.detach().cpu().numpy(), 'labels': tmp_labels.detach().cpu().numpy(), 'props': tmp_prop_flags.detach().cpu().numpy(), 'token': tokens[i] } # also save the tracking information for analyzing if track_dict is not None and len(track_dict['lines'])>0: tmp_track_scores = track_dict['scores'][i] tmp_track_vectors = track_dict['lines'][i] tmp_track_scores, tmp_track_labels = tmp_track_scores.max(-1) tmp_track_scores = tmp_track_scores.sigmoid() single_result['track_scores'] = tmp_track_scores.detach().cpu().numpy() single_result['track_vectors'] = tmp_track_vectors.detach().cpu().numpy() single_result['track_labels'] = tmp_track_labels.detach().cpu().numpy() else: single_result['track_scores'] = [] single_result['track_vectors'] = [] single_result['track_labels'] = [] results.append(single_result) return results def prepare_temporal_propagation(self, preds_dict, scene_name, local_idx, memory_bank=None, thr_track=0.1, thr_det=0.5): lines = preds_dict['lines'] # List[Tensor(num_queries, 2*num_points)] queries = preds_dict['hs_embeds'] bs = len(lines) assert bs == 1, 'now only support bs=1 for temporal-evolving inference' scores = preds_dict['scores'] # (bs, num_queries, 3) first_frame = local_idx == 0 tmp_vectors = lines[0] tmp_queries = queries[0] # focal loss if self.loss_cls.use_sigmoid: tmp_scores, tmp_labels = scores[0].max(-1) tmp_scores = tmp_scores.sigmoid() pos_track = tmp_scores[:-100] > thr_track pos_det = tmp_scores[-100:] > thr_det pos = torch.cat([pos_track, pos_det], dim=0) else: raise RuntimeError('The experiment uses sigmoid for cls outputs') pos_vectors = tmp_vectors[pos] pos_labels = tmp_labels[pos] pos_queries = tmp_queries[pos] pos_scores = tmp_scores[pos] if first_frame: global_ids = torch.arange(len(pos_vectors)) num_instance = len(pos_vectors) else: prop_ids = 
self.prop_info['global_ids'] prop_num_instance = self.prop_info['num_instance'] global_ids_track = prop_ids[pos_track] num_newborn = int(pos_det.sum()) global_ids_newborn = torch.arange(num_newborn) + prop_num_instance global_ids = torch.cat([global_ids_track, global_ids_newborn]) num_instance = prop_num_instance + num_newborn self.prop_info = { 'vectors': pos_vectors, 'queries': pos_queries, 'scores': pos_scores, 'labels': pos_labels, 'scene_name': scene_name, 'local_idx': local_idx, 'global_ids': global_ids, 'num_instance': num_instance, } if memory_bank is not None: if first_frame: num_tracks = 0 else: num_tracks = self.prop_active_tracks pos_out_inds = torch.where(pos)[0] prev_out = { 'hs_embeds': queries, 'scores': scores, } memory_bank.update_memory(0, first_frame, pos_out_inds, prev_out, num_tracks, local_idx, memory_bank.curr_t) self.prop_active_tracks = len(pos_out_inds) save_pos_results = { 'vectors': pos_vectors.cpu().numpy(), 'scores': pos_scores.cpu().numpy(), 'labels': pos_labels.cpu().numpy(), 'global_ids': global_ids.cpu().numpy(), 'scene_name': scene_name, 'local_idx': local_idx, 'num_instance': num_instance, } return save_pos_results def get_track_info(self, scene_name, local_idx): prop_info = self.prop_info assert prop_info['scene_name'] == scene_name and (prop_info['local_idx']+1 == local_idx or \ prop_info['local_idx'] == local_idx) vectors = prop_info['vectors'] queries = prop_info['queries'] device = queries.device target = {} target['track_query_hs_embeds'] = queries target['track_query_boxes'] = vectors track_info = [target, ] return track_info def get_self_iter_track_query(self, preds_dict): num_tracks = self.prop_active_tracks lines = preds_dict['lines'] # List[Tensor(num_queries, 2*num_points)] queries = preds_dict['hs_embeds'] bs = len(lines) assert bs == 1, 'now only support bs=1 for temporal-evolving inference' scores = preds_dict['scores'] # (bs, num_queries, 3) queries = queries[0][:num_tracks] vectors = lines[0][:num_tracks] target = {} target['track_query_hs_embeds'] = queries target['track_query_boxes'] = vectors track_info = [target, ] return track_info def clear_temporal_cache(self): self.prop_info = None def train(self, *args, **kwargs): super().train(*args, **kwargs) def eval(self): super().eval() def forward(self, *args, return_loss=True, **kwargs): if return_loss: return self.forward_train(*args, **kwargs) else: return self.forward_test(*args, **kwargs) ================================================ FILE: plugin/models/heads/MapSegHead.py ================================================ import copy import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import Conv2d, Linear, build_activation_layer, bias_init_with_prob, xavier_init from mmcv.runner import force_fp32 from mmcv.cnn.bricks.transformer import build_positional_encoding from mmdet.models import build_loss from mmdet.models import HEADS from einops import repeat @HEADS.register_module(force=True) class MapSegHead(nn.Module): def __init__(self, num_classes=3, in_channels=256, embed_dims=256, bev_size=(100,50), canvas_size=(200,100), loss_seg=dict(), loss_dice=dict(), ): super().__init__() self.num_classes = num_classes self.in_channels = in_channels self.embed_dims = embed_dims self.bev_size = bev_size self.canvas_size = canvas_size self.loss_seg = build_loss(loss_seg) self.loss_dice = build_loss(loss_dice) if self.loss_seg.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 assert canvas_size[0] % 
bev_size[0] == 0, 'canvas size must be a multiple of the bev size' self.num_up_blocks = int(np.log2(canvas_size[0] // bev_size[0])) self.conv_in = nn.Conv2d(in_channels, embed_dims, kernel_size=3, padding=1, bias=False) self.relu = nn.ReLU(inplace=True) self.conv_mid_layers = nn.ModuleList([]) self.downsample_layers = nn.ModuleList([]) for _ in range(self.num_up_blocks): conv_mid = nn.Sequential( nn.Upsample(scale_factor=2, mode='nearest'), nn.Conv2d(embed_dims, embed_dims, kernel_size=3, padding=1), nn.ReLU(inplace=True), ) self.conv_mid_layers.append(conv_mid) self.downsample_layers.append(nn.Upsample(scale_factor=0.5, mode='bilinear')) self.conv_out = nn.Conv2d(embed_dims, self.cls_out_channels, kernel_size=1, padding=0) self.init_weights() def init_weights(self): if self.loss_seg.use_sigmoid: bias_init = bias_init_with_prob(0.01) m = self.conv_out nn.init.constant_(m.bias, bias_init) def forward_train(self, bev_features, gts, history_coords): x = self.relu(self.conv_in(bev_features)) for conv_mid in self.conv_mid_layers: x = conv_mid(x) preds = self.conv_out(x) seg_loss = self.loss_seg(preds, gts) dice_loss = self.loss_dice(preds, gts) # downsample the features to the original bev size seg_feats = x for downsample in self.downsample_layers: seg_feats = downsample(seg_feats) return preds, seg_feats, seg_loss, dice_loss def forward_test(self, bev_features): x = self.relu(self.conv_in(bev_features)) for conv_mid in self.conv_mid_layers: x = conv_mid(x) preds = self.conv_out(x) seg_feats = x for downsample in self.downsample_layers: seg_feats = downsample(seg_feats) return preds, seg_feats def train(self, *args, **kwargs): super().train(*args, **kwargs) def eval(self): super().eval() def forward(self, *args, return_loss=True, **kwargs): if return_loss: return self.forward_train(*args, **kwargs) else: return self.forward_test(*args, **kwargs) ================================================ FILE: plugin/models/heads/__init__.py ================================================ from .MapDetectorHead import MapDetectorHead from .MapSegHead import MapSegHead ================================================ FILE: plugin/models/heads/base_map_head.py ================================================ from abc import ABCMeta, abstractmethod import torch.nn as nn from mmcv.runner import auto_fp16 from mmcv.utils import print_log from mmdet.utils import get_root_logger class BaseMapHead(nn.Module, metaclass=ABCMeta): """Base class for mappers.""" def __init__(self): super(BaseMapHead, self).__init__() self.fp16_enabled = False def init_weights(self, pretrained=None): """Initialize the weights in detector. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. 
""" if pretrained is not None: logger = get_root_logger() print_log(f'load model from: {pretrained}', logger=logger) @auto_fp16(apply_to=('img', )) def forward(self, *args, **kwargs): pass @abstractmethod def loss(self, pred, gt): ''' Compute loss Output: dict( loss: torch.Tensor log_vars: dict( str: float, ) num_samples: int ) ''' return @abstractmethod def post_process(self, pred): ''' convert model predictions to vectorized outputs the output format should be consistent with the evaluation function ''' return ================================================ FILE: plugin/models/losses/__init__.py ================================================ from .detr_loss import LinesL1Loss, MasksLoss, LenLoss from .seg_loss import MaskFocalLoss, MaskDiceLoss ================================================ FILE: plugin/models/losses/detr_loss.py ================================================ import torch from torch import nn as nn from torch.nn import functional as F from mmdet.models.losses import l1_loss, smooth_l1_loss from mmdet.models.losses.utils import weighted_loss import mmcv from mmdet.models.builder import LOSSES @LOSSES.register_module() class LinesL1Loss(nn.Module): def __init__(self, reduction='mean', loss_weight=1.0, beta=0.5): """ L1 loss. The same as the smooth L1 loss Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. """ super().__init__() self.reduction = reduction self.loss_weight = loss_weight self.beta = beta def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (torch.Tensor): The prediction. shape: [bs, ...] target (torch.Tensor): The learning target of the prediction. shape: [bs, ...] weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. it's useful when the predictions are not all valid. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if self.beta > 0: loss = smooth_l1_loss( pred, target, weight, reduction=reduction, avg_factor=avg_factor, beta=self.beta) else: loss = l1_loss( pred, target, weight, reduction=reduction, avg_factor=avg_factor) num_points = pred.shape[-1] // 2 loss = loss / num_points return loss*self.loss_weight @mmcv.jit(derivate=True, coderize=True) @weighted_loss def bce(pred, label, class_weight=None): """ pred: B,nquery,npts label: B,nquery,npts """ if label.numel() == 0: return pred.sum() * 0 assert pred.size() == label.size() loss = F.binary_cross_entropy_with_logits( pred, label.float(), pos_weight=class_weight, reduction='none') return loss @LOSSES.register_module() class MasksLoss(nn.Module): def __init__(self, reduction='mean', loss_weight=1.0): super(MasksLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """Forward function. 
Args: pred (torch.Tensor): mask logits with shape [bs, num_query, num_points]. target (torch.Tensor): binary mask targets with the same shape as pred. weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss = bce(pred, target, weight, reduction=reduction, avg_factor=avg_factor) return loss*self.loss_weight @mmcv.jit(derivate=True, coderize=True) @weighted_loss def ce(pred, label, class_weight=None): """ pred: B*nquery,npts label: B*nquery, """ if label.numel() == 0: return pred.sum() * 0 loss = F.cross_entropy( pred, label, weight=class_weight, reduction='none') return loss @LOSSES.register_module() class LenLoss(nn.Module): def __init__(self, reduction='mean', loss_weight=1.0): super(LenLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (torch.Tensor): logits with shape [bs*num_query, num_points]. target (torch.LongTensor): target indices with shape [bs*num_query, ]. weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss = ce(pred, target, weight, reduction=reduction, avg_factor=avg_factor) return loss*self.loss_weight ================================================ FILE: plugin/models/losses/seg_loss.py ================================================ import torch from torch import nn as nn from torch.nn import functional as F import mmcv from mmdet.models.builder import LOSSES from mmdet.models.losses import FocalLoss, weight_reduce_loss from einops import rearrange def py_sigmoid_focal_loss(pred, target, weight=None, gamma=2.0, alpha=0.25, reduction='mean', avg_factor=None): """PyTorch version of `Focal Loss <https://arxiv.org/abs/1708.02002>`_. Args: pred (torch.Tensor): The prediction with shape (N, C), C is the number of classes target (torch.Tensor): The learning label of the prediction. weight (torch.Tensor, optional): Sample-wise loss weight. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. alpha (float, optional): A balanced form for Focal Loss. Defaults to 0.25. reduction (str, optional): The method used to reduce the loss into a scalar. Defaults to 'mean'. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. """ pred_sigmoid = pred.sigmoid() target = target.type_as(pred) pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) focal_weight = (alpha * target + (1 - alpha) * (1 - target)) * pt.pow(gamma) loss = F.binary_cross_entropy_with_logits( pred, target, reduction='none') * focal_weight if weight is not None: if weight.shape != loss.shape: if weight.size(0) == loss.size(0): # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.view(-1, 1) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class).
assert weight.numel() == loss.numel() weight = weight.view(loss.size(0), -1) assert weight.ndim == loss.ndim loss = weight_reduce_loss(loss, weight, reduction, avg_factor) return loss @LOSSES.register_module() class MaskFocalLoss(FocalLoss): def __init__(self,**kwargs): super(MaskFocalLoss, self).__init__(**kwargs) def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if not self.use_sigmoid: raise NotImplementedError num_classes = pred.size(1) loss = 0 for index in range(num_classes): loss += self.loss_weight * py_sigmoid_focal_loss( pred[:,index], target[:,index], weight, gamma=self.gamma, alpha=self.alpha, reduction=reduction, avg_factor=avg_factor) loss /= num_classes return loss * self.loss_weight @LOSSES.register_module() class MaskDiceLoss(nn.Module): """Dice Loss PyTorch Created by: Zhang Shuai Email: shuaizzz666@gmail.com dice_loss = 1 - 2*p*t / (p^2 + t^2). p and t represent predict and target. Args: weight: An array of shape [C,] predict: A float32 tensor of shape [N, C, *], for Semantic segmentation task is [N, C, H, W] target: A int64 tensor of shape [N, *], for Semantic segmentation task is [N, H, W] Return: diceloss """ def __init__(self, loss_weight): super(MaskDiceLoss, self).__init__() self.smooth = 1e-5 self.loss_weight = loss_weight def forward(self, pred, target): bs, num_classes = pred.shape[:2] pred = rearrange(pred, 'b n h w -> b n (h w)') target = rearrange(target, 'b n h w -> b n (h w)') pred = pred.sigmoid() intersection = torch.sum(pred * target, dim=2) # (N, C) union = torch.sum(pred.pow(2), dim=2) + torch.sum(target, dim=2) # (N, C) ## p^2 + t^2 >= 2*p*t, target_onehot^2 == target_onehot dice_coef = (2 * intersection + self.smooth) / (union + self.smooth) # (N, C) dice_loss = 1 - torch.mean(dice_coef) # 1 loss = self.loss_weight * dice_loss return loss ================================================ FILE: plugin/models/mapers/MapTracker.py ================================================ """ MapTracker main module, adapted from StreamMapNet """ import numpy as np import torch import torch.nn as nn from mmdet3d.models.builder import (build_backbone, build_head) from .base_mapper import BaseMapper, MAPPERS from ..utils.query_update import MotionMLP from copy import deepcopy from mmdet.core import multi_apply from einops import rearrange, repeat from scipy.spatial.transform import Rotation as R from .vector_memory import VectorInstanceMemory @MAPPERS.register_module() class MapTracker(BaseMapper): def __init__(self, bev_h, bev_w, roi_size, backbone_cfg=dict(), head_cfg=dict(), neck_cfg=None, seg_cfg=None, model_name=None, pretrained=None, history_steps=None, test_time_history_steps=None, mem_select_dist_ranges=[0,0,0,0], skip_vector_head=False, freeze_bev=False, freeze_bev_iters=None, track_fp_aug=True, use_memory=False, mem_len=None, mem_warmup_iters=-1, **kwargs): super().__init__() #Attribute self.model_name = model_name self.last_epoch = None self.backbone = build_backbone(backbone_cfg) if neck_cfg is not None: self.neck = build_head(neck_cfg) else: self.neck = nn.Identity() self.head = build_head(head_cfg) self.num_decoder_layers = self.head.transformer.decoder.num_layers self.skip_vector_head = skip_vector_head self.freeze_bev = freeze_bev # whether freeze bev related parameters self.freeze_bev_iters = freeze_bev_iters # whether freeze bev related parameters self.track_fp_aug = track_fp_aug 
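        # NOTE: use_memory and mem_warmup_iters (stored below) gate the vector instance
        # memory: during training the memory bank is only consulted once
        # self.num_iter > self.mem_warmup_iters (see forward_train), so the first
        # iterations behave like the memory-free model.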
self.use_memory = use_memory self.mem_warmup_iters = mem_warmup_iters # the track query propagation module, using relative pose c_dim = 7 # quaternion for rotation (4) + translation (3) self.query_propagate = MotionMLP(c_dim=c_dim, f_dim=self.head.embed_dims, identity=True) # BEV semantic seg head self.seg_decoder = build_head(seg_cfg) # BEV self.bev_h = bev_h self.bev_w = bev_w self.roi_size = roi_size self.history_steps = history_steps self.mem_len = mem_len # Set up test time memory selection hyper-parameters if test_time_history_steps is None: self.test_time_history_steps = history_steps else: self.test_time_history_steps = test_time_history_steps self.mem_select_dist_ranges = mem_select_dist_ranges # vector instance memory module if self.use_memory: self.memory_bank = VectorInstanceMemory( dim_in=head_cfg.embed_dims, number_ins=head_cfg.num_queries, bank_size=mem_len, mem_len=mem_len, mem_select_dist_ranges=self.mem_select_dist_ranges, ) xmin, xmax = -roi_size[0]/2, roi_size[0]/2 ymin, ymax = -roi_size[1]/2, roi_size[1]/2 x = torch.linspace(xmin, xmax, bev_w) y = torch.linspace(ymax, ymin, bev_h) y, x = torch.meshgrid(y, x) z = torch.zeros_like(x) ones = torch.ones_like(x) plane = torch.stack([x, y, z, ones], dim=-1) self.register_buffer('plane', plane.double()) self.init_weights(pretrained) def init_weights(self, pretrained=None): """Initialize model weights.""" if pretrained: import logging logger = logging.getLogger() from mmcv.runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) else: try: self.neck.init_weights() except AttributeError: pass def temporal_propagate(self, curr_bev_feats, img_metas, all_history_curr2prev, all_history_prev2curr, use_memory, track_query_info=None, timestep=None, get_trans_loss=False): ''' Args: curr_bev_feat: torch.Tensor of shape [B, neck_input_channels, H, W] img_metas: current image metas (List of #bs samples) bev_memory: where to load and store (training and testing use different buffer) pose_memory: where to load and store (training and testing use different buffer) Out: fused_bev_feat: torch.Tensor of shape [B, neck_input_channels, H, W] ''' bs = curr_bev_feats.size(0) if get_trans_loss: # init the trans_loss related variables here trans_reg_loss = curr_bev_feats.new_zeros((1,)) trans_cls_loss = curr_bev_feats.new_zeros((1,)) back_trans_reg_loss = curr_bev_feats.new_zeros((1,)) back_trans_cls_loss = curr_bev_feats.new_zeros((1,)) num_pos = 0 num_tracks = 0 if use_memory: self.memory_bank.clear_dict() for b_i in range(bs): curr_e2g_trans = self.plane.new_tensor(img_metas[b_i]['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = self.plane.new_tensor(img_metas[b_i]['ego2global_rotation'], dtype=torch.float64) if use_memory: self.memory_bank.curr_rot[b_i] = curr_e2g_rot self.memory_bank.curr_trans[b_i] = curr_e2g_trans if self.memory_bank.curr_t > 0: self.memory_bank.trans_memory_bank(self.query_propagate, b_i, img_metas[b_i]) # transform the track queries if track_query_info is not None: history_curr2prev_matrix = all_history_curr2prev[b_i] history_prev2curr_matrix = all_history_prev2curr[b_i] track_pts = track_query_info[b_i]['track_query_boxes'].clone() track_pts = rearrange(track_pts, 'n (k c) -> n k c', c=2) # from (0, 1) to (-30, 30) or (-15, 15), prep for transform track_pts = self._denorm_lines(track_pts) # Transform the track ref-points using relative pose between prev and curr N, num_points = track_pts.shape[0], track_pts.shape[1] track_pts = torch.cat([ track_pts, track_pts.new_zeros((N, 
num_points, 1)), # z-axis track_pts.new_ones((N, num_points, 1)) # 4-th dim ], dim=-1) # (num_prop, num_pts, 4) pose_matrix = history_prev2curr_matrix[-1].float()[:3] rot_mat = pose_matrix[:, :3].cpu().numpy() rot = R.from_matrix(rot_mat) translation = pose_matrix[:, 3] trans_matrix = history_prev2curr_matrix[-1].clone() # Add training-time perturbation here for the transformation matrix if self.training: rot, translation = self.add_noise_to_pose(rot, translation) trans_matrix[:3, :3] = torch.tensor(rot.as_matrix()).to(trans_matrix.device) trans_matrix[:3, 3] = torch.tensor(translation).to(trans_matrix.device) trans_track_pts = torch.einsum('lk,ijk->ijl', trans_matrix, track_pts.double()).float() trans_track_pts = trans_track_pts[..., :2] trans_track_pts = self._norm_lines(trans_track_pts) trans_track_pts = torch.clip(trans_track_pts, min=0., max=1.) trans_track_pts = rearrange(trans_track_pts, 'n k c -> n (k c)', c=2) track_query_info[b_i]['trans_track_query_boxes'] = trans_track_pts prop_q = track_query_info[b_i]['track_query_hs_embeds'] rot_quat = torch.tensor(rot.as_quat()).float().to(pose_matrix.device) pose_info = torch.cat([rot_quat.view(-1), translation], dim=0) track_query_updated = self.query_propagate( prop_q, # (topk, embed_dims) pose_info.repeat(len(prop_q), 1) ) # Do not let future-frame loss backprop through the track queries track_query_info[b_i]['track_query_hs_embeds'] = track_query_updated.clone().detach() if get_trans_loss: pred = self.head.reg_branches[-1](track_query_updated).sigmoid() # (num_prop, 2*num_pts) pred_scores = self.head.cls_branches[-1](track_query_updated) assert list(pred.shape) == [N, 2*num_points] gt_pts = track_query_info[b_i]['track_query_gt_lines'].clone() gt_labels = track_query_info[b_i]['track_query_gt_labels'].clone() weights = gt_pts.new_ones((N, 2*num_points)) weights_labels = gt_labels.new_ones((N,)) bg_idx = gt_labels == 3 num_pos = num_pos + (N - bg_idx.sum()) num_tracks += len(gt_labels) weights[bg_idx, :] = 0.0 gt_pts = rearrange(gt_pts, 'n (k c) -> n k c', c=2) denormed_targets = self._denorm_lines(gt_pts) denormed_targets = torch.cat([ denormed_targets, denormed_targets.new_zeros((N, num_points, 1)), # z-axis denormed_targets.new_ones((N, num_points, 1)) # 4-th dim ], dim=-1) # (num_prop, num_pts, 4) assert list(denormed_targets.shape) == [N, num_points, 4] curr_targets = torch.einsum('lk,ijk->ijl', trans_matrix.float(), denormed_targets) curr_targets = curr_targets[..., :2] normed_targets = self._norm_lines(curr_targets) normed_targets = rearrange(normed_targets, 'n k c -> n (k c)', c=2) # set the weight of invalid normed targets to 0 (outside current bev frame) invalid_bev_mask = (normed_targets <= 0) | (normed_targets>=1) weights[invalid_bev_mask] = 0 # (num_prop, 2*num_pts) trans_reg_loss += self.head.loss_reg(pred, normed_targets, weights, avg_factor=1.0) if len(gt_labels) > 0: trans_score = self.head.loss_cls(pred_scores, gt_labels, weights_labels, avg_factor=1.0) else: trans_score = 0.0 trans_cls_loss += trans_score # backward trans loss pose_matrix_inv = torch.inverse(trans_matrix).float()[:3] rot_mat_inv = pose_matrix_inv[:, :3].cpu().numpy() rot_inv = R.from_matrix(rot_mat_inv) rot_quat_inv = torch.tensor(rot_inv.as_quat()).float().to(pose_matrix_inv.device) translation_inv = pose_matrix_inv[:, 3] pose_info_inv = torch.cat([rot_quat_inv.view(-1), translation_inv], dim=0) track_query_backtrans = self.query_propagate( track_query_updated, # (topk, embed_dims) pose_info_inv.repeat(len(prop_q), 1) ) pred_backtrans = 
self.head.reg_branches[-1](track_query_backtrans).sigmoid() # (num_prop, 2*num_pts) pred_scores_backtrans = self.head.cls_branches[-1](track_query_backtrans) prev_gt_pts = track_query_info[b_i]['track_query_gt_lines'] back_trans_reg_loss += self.head.loss_reg(pred_backtrans, prev_gt_pts, weights, avg_factor=1.0) if len(gt_labels) > 0: trans_score_bak = self.head.loss_cls(pred_scores_backtrans, gt_labels, weights_labels, avg_factor=1.0) else: trans_score_bak = 0.0 back_trans_cls_loss += trans_score_bak if get_trans_loss: trans_loss = self.head.trans_loss_weight * (trans_reg_loss / (num_pos + 1e-10) + trans_cls_loss / (num_tracks + 1e-10)) back_trans_loss = self.head.trans_loss_weight * (back_trans_reg_loss / (num_pos + 1e-10) + back_trans_cls_loss / (num_tracks + 1e-10)) trans_loss_dict = { 'f_trans': trans_loss, 'b_trans': back_trans_loss, } return trans_loss_dict def add_noise_to_pose(self, rot, trans): rot_euler = rot.as_euler('zxy') # 0.08 mean is around 5-degree, 3-sigma is 15-degree noise_euler = np.random.randn(*list(rot_euler.shape)) * 0.08 rot_euler += noise_euler noisy_rot = R.from_euler('zxy', rot_euler) # error within 0.25 meter noise_trans = torch.randn_like(trans) * 0.25 noise_trans[2] = 0 noisy_trans = trans + noise_trans return noisy_rot, noisy_trans def process_history_info(self, img_metas, history_img_metas): bs = len(img_metas) all_history_curr2prev = [] all_history_prev2curr = [] all_history_coord = [] if len(history_img_metas) == 0: return all_history_curr2prev, all_history_prev2curr, all_history_coord for b_i in range(bs): history_e2g_trans = torch.stack([self.plane.new_tensor(prev[b_i]['ego2global_translation'], dtype=torch.float64) for prev in history_img_metas], dim=0) history_e2g_rot = torch.stack([self.plane.new_tensor(prev[b_i]['ego2global_rotation'], dtype=torch.float64) for prev in history_img_metas], dim=0) curr_e2g_trans = self.plane.new_tensor(img_metas[b_i]['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = self.plane.new_tensor(img_metas[b_i]['ego2global_rotation'], dtype=torch.float64) # Do the coords transformation for all features in the history buffer ## Prepare the transformation matrix history_g2e_matrix = torch.stack([torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device),]*len(history_e2g_trans), dim=0) history_g2e_matrix[:, :3, :3] = torch.transpose(history_e2g_rot, 1, 2) history_g2e_matrix[:, :3, 3] = -torch.bmm(torch.transpose(history_e2g_rot, 1, 2), history_e2g_trans[..., None]).squeeze(-1) curr_g2e_matrix = torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) curr_e2g_matrix = torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device) curr_e2g_matrix[:3, :3] = curr_e2g_rot curr_e2g_matrix[:3, 3] = curr_e2g_trans history_e2g_matrix = torch.stack([torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device),]*len(history_e2g_trans), dim=0) history_e2g_matrix[:, :3, :3] = history_e2g_rot history_e2g_matrix[:, :3, 3] = history_e2g_trans history_curr2prev_matrix = torch.bmm(history_g2e_matrix, repeat(curr_e2g_matrix,'n1 n2 -> r n1 n2', r=len(history_g2e_matrix))) history_prev2curr_matrix = torch.bmm(repeat(curr_g2e_matrix, 'n1 n2 -> r n1 n2', r=len(history_e2g_matrix)), history_e2g_matrix) history_coord = torch.einsum('nlk,ijk->nijl', history_curr2prev_matrix, self.plane).float()[..., :2] # from (-30, 30) or (-15, 15) to (-1, 1) history_coord[..., 0] = history_coord[..., 0] / (self.roi_size[0]/2) 
history_coord[..., 1] = -history_coord[..., 1] / (self.roi_size[1]/2) all_history_curr2prev.append(history_curr2prev_matrix) all_history_prev2curr.append(history_prev2curr_matrix) all_history_coord.append(history_coord) return all_history_curr2prev, all_history_prev2curr, all_history_coord def forward_train(self, img, vectors, semantic_mask, points=None, img_metas=None, all_prev_data=None, all_local2global_info=None, **kwargs): ''' Args: img: torch.Tensor of shape [B, N, 3, H, W] N: number of cams vectors: list[list[Tuple(lines, length, label)]] - lines: np.array of shape [num_points, 2]. - length: int - label: int len(vectors) = batch_size len(vectors[_b]) = num of lines in sample _b img_metas: img_metas['lidar2img']: [B, N, 4, 4] Out: loss, log_vars, num_sample ''' # prepare labels and images gts, img, img_metas, valid_idx, points = self.batch_data( vectors, img, img_metas, img.device, points) bs = img.shape[0] _use_memory = self.use_memory and self.num_iter > self.mem_warmup_iters if all_prev_data is not None: num_prev_frames = len(all_prev_data) all_gts_prev, all_img_prev, all_img_metas_prev, all_semantic_mask_prev = [], [], [], [] for prev_data in all_prev_data: gts_prev, img_prev, img_metas_prev, valid_idx_prev, _ = self.batch_data( prev_data['vectors'], prev_data['img'], prev_data['img_metas'], img.device ) all_gts_prev.append(gts_prev) all_img_prev.append(img_prev) all_img_metas_prev.append(img_metas_prev) all_semantic_mask_prev.append(prev_data['semantic_mask']) else: num_prev_frames = 0 assert points is None if self.skip_vector_head: backprop_backbone_ids = [0, num_prev_frames] # first and last frame train the backbone (bev pretrain) else: backprop_backbone_ids = [num_prev_frames, ] # only the last frame trains the backbone (all other settings) track_query_info = None all_loss_dict_prev = [] all_trans_loss = [] all_outputs_prev = [] self.tracked_query_length = {} if _use_memory: self.memory_bank.set_bank_size(self.mem_len) self.memory_bank.init_memory(bs=bs) # History records for bev features history_bev_feats = [] history_img_metas = [] gt_semantic = torch.flip(semantic_mask, [2,]) # Iterate through all prev frames for t in range(num_prev_frames): # Backbone for prev img_backbone_gradient = (t in backprop_backbone_ids) all_history_curr2prev, all_history_prev2curr, all_history_coord = \ self.process_history_info(all_img_metas_prev[t], history_img_metas) _bev_feats, mlvl_feats = self.backbone(all_img_prev[t], all_img_metas_prev[t], t, history_bev_feats, history_img_metas, all_history_coord, points=None, img_backbone_gradient=img_backbone_gradient) # Neck for prev bev_feats = self.neck(_bev_feats) if _use_memory: self.memory_bank.curr_t = t # Transform prev-frame feature & pts to curr frame if self.skip_vector_head or t == 0: self.temporal_propagate(bev_feats, all_img_metas_prev[t], all_history_curr2prev, all_history_prev2curr, _use_memory, track_query_info, timestep=t, get_trans_loss=False) else: trans_loss_dict = self.temporal_propagate(bev_feats, all_img_metas_prev[t], all_history_curr2prev, all_history_prev2curr, _use_memory, track_query_info, timestep=t, get_trans_loss=True) ######################################################## # Debugging use: visualize the first-frame track query. 
and the corresponding # ground-truth information # Do this for every timestep > 0 #self._viz_temporal_supervision(outputs_prev, track_query_info, gts_next[-1], gts_prev[-1], # gts_semantic_curr, gts_semantic_prev, img_metas_next, img_metas_prev, t) #import pdb; pdb.set_trace() ######################################################## img_metas_prev = all_img_metas_prev[t] img_metas_next = all_img_metas_prev[t+1] if t < num_prev_frames-1 else img_metas gts_prev = all_gts_prev[t] gts_next = all_gts_prev[t+1] if t!=num_prev_frames-1 else gts gts_semantic_prev = torch.flip(all_semantic_mask_prev[t], [2,]) gts_semantic_curr = torch.flip(all_semantic_mask_prev[t+1], [2,]) if t!=num_prev_frames-1 else gt_semantic local2global_prev = all_local2global_info[t] local2global_next = all_local2global_info[t+1] # Compute the semantic segmentation loss seg_preds, seg_feats, seg_loss, seg_dice_loss = self.seg_decoder(bev_feats, gts_semantic_prev, all_history_coord, return_loss=True) # Save the history history_bev_feats.append(bev_feats) history_img_metas.append(all_img_metas_prev[t]) if len(history_bev_feats) > self.history_steps: history_bev_feats.pop(0) history_img_metas.pop(0) if not self.skip_vector_head: # Prepare the two-frame instance matching info gt_cur2prev, gt_prev2cur = self.get_two_frame_matching(local2global_prev, local2global_next, gts_prev, gts_next) if t == 0: memory_bank = None else: memory_bank = self.memory_bank if _use_memory else None # 1). Compute the loss for prev frame # 2). Get the matching results for computing the track query to next frame loss_dict_prev, outputs_prev, prev_inds_list, prev_gt_inds_list, prev_matched_reg_cost, \ prev_gt_list = self.head( bev_features=bev_feats, img_metas=img_metas_prev, gts=gts_prev, track_query_info=track_query_info, memory_bank=memory_bank, return_loss=True, return_matching=True) all_outputs_prev.append(outputs_prev) if t > 0: all_trans_loss.append(trans_loss_dict) # Do the query prop and negative sampling, prepare the corrpespnding # updated G.T. labels. 
The prepared queries will be passed to the model, # and combind with the original queries inside the head model pos_th = 0.4 track_query_info = self.prepare_track_queries_and_targets(gts_next, prev_inds_list, prev_gt_inds_list, prev_matched_reg_cost, prev_gt_list, outputs_prev, gt_cur2prev, gt_prev2cur, img_metas_prev, _use_memory, pos_th=pos_th, timestep=t) else: loss_dict_prev = {} loss_dict_prev['seg'] = seg_loss loss_dict_prev['seg_dice'] = seg_dice_loss all_loss_dict_prev.append(loss_dict_prev) if _use_memory: self.memory_bank.curr_t = num_prev_frames # NOTE: we separate the last frame to be consistent with single-frame only setting) # Backbone for curr img_backbone_gradient = num_prev_frames in backprop_backbone_ids all_history_curr2prev, all_history_prev2curr, all_history_coord = self.process_history_info(img_metas, history_img_metas) _bev_feats, mlvl_feats = self.backbone(img, img_metas, num_prev_frames, history_bev_feats, history_img_metas, all_history_coord, points=None, img_backbone_gradient=img_backbone_gradient) # Neck for curr bev_feats = self.neck(_bev_feats) if self.skip_vector_head or num_prev_frames == 0: # Transform prev-frame feature & pts to curr frame using the relative pose assert track_query_info is None self.temporal_propagate(bev_feats, img_metas, all_history_curr2prev, all_history_prev2curr, _use_memory, track_query_info, timestep=num_prev_frames, get_trans_loss=False) else: trans_loss_dict = self.temporal_propagate(bev_feats, img_metas, all_history_curr2prev, all_history_prev2curr, _use_memory, track_query_info, timestep=num_prev_frames, get_trans_loss=True) all_trans_loss.append(trans_loss_dict) ######################################################## # Debugging use: visualize the first-frame track query. and the corresponding # ground-truth information # Do this for every timestep > 0 #assert num_prev_frames > 0 #self._viz_temporal_supervision(outputs_prev, track_query_info, gts_next[-1], gts_prev[-1], gt_semantic, # gts_semantic_prev, img_metas_next, img_metas_prev, timestep=num_prev_frames) #import pdb; pdb.set_trace() ######################################################## seg_preds, seg_feats, seg_loss, seg_dice_loss = self.seg_decoder(bev_feats, gt_semantic, all_history_coord, return_loss=True) if not self.skip_vector_head: memory_bank = self.memory_bank if _use_memory else None # 3. 
run the head again and compute the loss for the second frame preds_list, loss_dict, det_match_idxs, det_match_gt_idxs, gt_list = self.head( bev_features=bev_feats, img_metas=img_metas, gts=gts, track_query_info=track_query_info, memory_bank=memory_bank, return_loss=True) else: loss_dict = {} loss_dict['seg'] = seg_loss loss_dict['seg_dice'] = seg_dice_loss # format loss, average over all frames (2 frames for now) loss = 0 losses_t = [] for loss_dict_t in (all_loss_dict_prev + [loss_dict,]): loss_t = 0 for name, var in loss_dict_t.items(): loss_t = loss_t + var losses_t.append(loss_t) loss += loss_t for trans_loss_dict_t in all_trans_loss: trans_loss_t = trans_loss_dict_t['f_trans'] + trans_loss_dict_t['b_trans'] loss += trans_loss_t # update the log log_vars = {k: v.item() for k, v in loss_dict.items()} for t, loss_dict_t in enumerate(all_loss_dict_prev): log_vars_t = {k+'_t{}'.format(t): v.item() for k, v in loss_dict_t.items()} log_vars.update(log_vars_t) for t, loss_t in enumerate(losses_t): log_vars.update({'total_t{}'.format(t): loss_t.item()}) for t, trans_loss_dict_t in enumerate(all_trans_loss): log_vars_t = {k+'_t{}'.format(t): v.item() for k, v in trans_loss_dict_t.items()} log_vars.update(log_vars_t) log_vars.update({'total': loss.item()}) num_sample = img.size(0) return loss, log_vars, num_sample @torch.no_grad() def forward_test(self, img, points=None, img_metas=None, seq_info=None, **kwargs): ''' inference pipeline ''' assert img.shape[0] == 1, 'Only support bs=1 per-gpu for inference' tokens = [] for img_meta in img_metas: tokens.append(img_meta['token']) scene_name, local_idx, seq_length = seq_info[0] first_frame = (local_idx == 0) img_metas[0]['local_idx'] = local_idx if first_frame: if self.use_memory: self.memory_bank.set_bank_size(self.test_time_history_steps) #self.memory_bank.set_bank_size(self.mem_len) self.memory_bank.init_memory(bs=1) self.history_bev_feats_all = [] self.history_img_metas_all = [] if self.use_memory: self.memory_bank.curr_t = local_idx selected_mem_ids = self.select_memory_entries(self.history_img_metas_all, img_metas) history_img_metas = [self.history_img_metas_all[idx] for idx in selected_mem_ids] history_bev_feats = [self.history_bev_feats_all[idx] for idx in selected_mem_ids] all_history_curr2prev, all_history_prev2curr, all_history_coord = \ self.process_history_info(img_metas, history_img_metas) _bev_feats, mlvl_feats = self.backbone(img, img_metas, local_idx, history_bev_feats, history_img_metas, all_history_coord, points=points) img_shape = [_bev_feats.shape[2:] for i in range(_bev_feats.shape[0])] # Neck bev_feats = self.neck(_bev_feats) if self.skip_vector_head or first_frame: self.temporal_propagate(bev_feats, img_metas, all_history_curr2prev, \ all_history_prev2curr, self.use_memory, track_query_info=None) seg_preds, seg_feats = self.seg_decoder(bev_features=bev_feats, return_loss=False) if not self.skip_vector_head: preds_list = self.head(bev_feats, img_metas=img_metas, return_loss=False) track_dict = None else: # Using the saved prev-frame output to prepare the track query inputs track_query_info = self.head.get_track_info(scene_name, local_idx) # Transform prev-frame feature & pts to curr frame using the relative pose self.temporal_propagate(bev_feats, img_metas, all_history_curr2prev, all_history_prev2curr, self.use_memory, track_query_info) seg_preds, seg_feats = self.seg_decoder(bev_features=bev_feats, return_loss=False) # Run the vector map decoder with instance-level memory memory_bank = self.memory_bank if self.use_memory else 
None preds_list = self.head(bev_feats, img_metas=img_metas, track_query_info=track_query_info, memory_bank=memory_bank, return_loss=False) track_dict = self._process_track_query_info(track_query_info) if not self.skip_vector_head: # take predictions from the last layer preds_dict = preds_list[-1] else: preds_dict = None # Save the BEV and meta-info history self.history_bev_feats_all.append(bev_feats) self.history_img_metas_all.append(img_metas) if len(self.history_bev_feats_all) > self.test_time_history_steps: self.history_bev_feats_all.pop(0) self.history_img_metas_all.pop(0) if not self.skip_vector_head: memory_bank = self.memory_bank if self.use_memory else None thr_det = 0.4 if first_frame else 0.6 pos_results = self.head.prepare_temporal_propagation(preds_dict, scene_name, local_idx, memory_bank, thr_track=0.5, thr_det=thr_det) if not self.skip_vector_head: results_list = self.head.post_process(preds_dict, tokens, track_dict) results_list[0]['pos_results'] = pos_results results_list[0]['meta'] = img_metas[0] else: results_list = [{'vectors': [], 'scores': [], 'labels': [], 'props': [], 'token': token} for token in tokens] # Add the segmentation preds to the results to be saved for b_i in range(len(results_list)): tmp_scores, tmp_labels = seg_preds[b_i].max(0) tmp_scores = tmp_scores.sigmoid() preds_i = torch.zeros(tmp_labels.shape, dtype=torch.uint8).to(tmp_scores.device) pos_ids = tmp_scores >= 0.4 preds_i[pos_ids] = tmp_labels[pos_ids].type(torch.uint8) + 1 preds_i = preds_i.cpu().numpy() results_list[b_i]['semantic_mask'] = preds_i if 'token' not in results_list[b_i]: results_list[b_i]['token'] = tokens[b_i] return results_list def batch_data(self, vectors, imgs, img_metas, device, points=None): bs = len(vectors) # filter none vector's case num_gts = [] for idx in range(bs): num_gts.append(sum([len(v) for k, v in vectors[idx].items()])) valid_idx = [i for i in range(bs) if num_gts[i] > 0] assert len(valid_idx) == bs # make sure every sample has gts all_labels_list = [] all_lines_list = [] all_gt2local = [] all_local2gt = [] for idx in range(bs): labels = [] lines = [] gt2local = [] local2gt = {} for label, _lines in vectors[idx].items(): for _ins_id, _line in enumerate(_lines): labels.append(label) gt2local.append([label, _ins_id]) local2gt[(label, _ins_id)] = len(lines) if len(_line.shape) == 3: # permutation num_permute, num_points, coords_dim = _line.shape lines.append(torch.tensor(_line).reshape(num_permute, -1)) # (38, 40) elif len(_line.shape) == 2: lines.append(torch.tensor(_line).reshape(-1)) # (40, ) else: assert False all_labels_list.append(torch.tensor(labels, dtype=torch.long).to(device)) all_lines_list.append(torch.stack(lines).float().to(device)) all_gt2local.append(gt2local) all_local2gt.append(local2gt) gts = { 'labels': all_labels_list, 'lines': all_lines_list, 'gt2local': all_gt2local, 'local2gt': all_local2gt, } gts = [deepcopy(gts) for _ in range(self.num_decoder_layers)] return gts, imgs, img_metas, valid_idx, points def get_two_frame_matching(self, local2global_prev, local2global_curr, gts_prev, gts): """ Get the G.T. matching between the two frames Terminology: (1). local --> local idx inside each category; (2). global --> global instance id inside category (3). gt --> index in the flattened G.T. sequence Args: prev_ins_ids (_type_): global ids (pre-prepared) for prev frame curr_ins_ids (_type_): global ids (pre-prepared) for curr frame gts_prev (_type_): processed G.T. for prev frame gts (_type_): processed G.T. 
for curr frame """ bs = len(local2global_prev) gt2local_curr = gts[-1]['gt2local'] # don't need the per-block supervision, just take one gt2local_prev = gts_prev[-1]['gt2local'] local2gt_prev = gts_prev[-1]['local2gt'] # the comma is to take the single-element output from multi_apply global2local_prev, = multi_apply(self._reverse_id_mapping, local2global_prev) all_gt_cur2prev, all_gt_prev2cur = multi_apply(self._compute_cur2prev, gt2local_curr, gt2local_prev, local2gt_prev, local2global_curr, global2local_prev) return all_gt_cur2prev, all_gt_prev2cur def _compute_cur2prev(self, gt2local_curr, gt2local_prev, local2gt_prev, local2global_curr, global2local_prev): cur2prev = torch.zeros(len(gt2local_curr)) prev2cur = torch.zeros(len(gt2local_prev)) prev2cur[:] = -1 for gt_idx_curr in range(len(gt2local_curr)): label = gt2local_curr[gt_idx_curr][0] local_idx = gt2local_curr[gt_idx_curr][1] seq_id = local2global_curr[label][local_idx] if seq_id in global2local_prev[label]: local_id_prev = global2local_prev[label][seq_id] gt_idx_prev = local2gt_prev[(label, local_id_prev)] else: gt_idx_prev = -1 cur2prev[gt_idx_curr] = gt_idx_prev if gt_idx_prev != -1: # there is a positive match in prev frame prev2cur[gt_idx_prev] = gt_idx_curr # update the information return cur2prev, prev2cur def _reverse_id_mapping(self, id_mapping): reversed_mapping = {} for label, mapping in id_mapping.items(): r_map = {v:k for k,v in mapping.items()} reversed_mapping[label] = r_map return reversed_mapping, def prepare_track_queries_and_targets(self, gts, prev_inds_list, prev_gt_inds_list, prev_matched_reg_cost, prev_gt_list, prev_out, gt_cur2prev, gt_prev2cur, metas_prev, use_memory, pos_th=0.4, timestep=None): bs = len(prev_inds_list) device = prev_out['lines'][0].device targets = [] for b_i in range(bs): results = {} for key, val in gts[-1].items(): results[key] = val[b_i] targets.append(results) # for each sample in the batch for b_i, (target, prev_out_ind, prev_target_ind) in enumerate(zip(targets, prev_inds_list, prev_gt_inds_list)): scene_seq_id = metas_prev[b_i]['local_idx'] scores = prev_out['scores'][b_i].detach() scores, labels = scores.max(-1) scores = scores.sigmoid() match_cost = prev_matched_reg_cost[b_i] target_prev2cur = gt_prev2cur[b_i].to(device) target['prev_target_ind'] = prev_target_ind # record the matched g.t. index target['prev_out_ind'] = prev_out_ind target['gt_prev2cur'] = target_prev2cur assert len(target_prev2cur) == len(prev_gt_inds_list[b_i]) # 1). filter the ones with low scores, create FN; prev_pos_scores = scores[prev_out_ind] score_filter_mask = prev_pos_scores >= pos_th keep_mask = score_filter_mask prev_out_ind_filtered = prev_out_ind[keep_mask] prev_target_ind_filtered = prev_target_ind[keep_mask] target_prev2cur = target_prev2cur[prev_target_ind_filtered] target_ind_matching = (target_prev2cur != -1) # -1 means no matching g.t. in curr frame # matched g.t. 
index in the current frame target_ind_matched_idx = target_prev2cur[target_prev2cur!=-1] target['track_query_match_ids'] = target_ind_matched_idx if timestep == 0: pad_bound = self.head.num_queries else: pad_bound = self.tracked_query_length[b_i] + self.head.num_queries not_prev_out_ind = torch.arange(prev_out['lines'][b_i].shape[0]).to(device) not_prev_out_ind = torch.tensor([ ind.item() for ind in not_prev_out_ind if ind not in prev_out_ind and ind < pad_bound]) # Get all non-matched pred with >0.5 conf score, serve as FP neg_scores = scores[not_prev_out_ind] neg_score_mask = neg_scores >= pos_th # Randomly pick 10% neg output instances and serve as FP _rand_insert = torch.rand([len(neg_scores)]).to(device) if self.track_fp_aug: rand_insert_mask = _rand_insert >= 0.95 fp_select_mask = neg_score_mask | rand_insert_mask else: fp_select_mask = neg_score_mask false_out_ind = not_prev_out_ind[fp_select_mask] prev_out_ind_final = torch.tensor(prev_out_ind_filtered.tolist() + false_out_ind.tolist()).long() target_ind_matching = torch.cat([ target_ind_matching, torch.tensor([False, ] * len(false_out_ind)).bool().to(device) ]) target_prev2cur_aug = torch.cat([ target_prev2cur, torch.tensor([-1, ] * len(false_out_ind)).to(device) ]) target['track_to_cur_gt_ids'] = target_prev2cur_aug # track query masks track_queries_mask = torch.ones_like(target_ind_matching).bool() track_queries_fal_pos_mask = torch.zeros_like(target_ind_matching).bool() track_queries_fal_pos_mask[~target_ind_matching] = True # set prev frame info target['track_query_hs_embeds'] = prev_out['hs_embeds'][b_i, prev_out_ind_final] target['track_query_boxes'] = prev_out['lines'][b_i][prev_out_ind_final].detach() tmp_labels = labels[prev_out_ind_final] tmp_scores = scores[prev_out_ind_final] target['track_query_labels'] = tmp_labels target['track_query_scores'] = tmp_scores # Prepare the G.T. 
line coords for the track queries, used in the transformation loss prev_gt_lines = prev_gt_list['lines'][b_i] prev_gt_labels = prev_gt_list['labels'][b_i] target['track_query_gt_lines'] = prev_gt_lines[prev_out_ind_final] target['track_query_gt_labels'] = prev_gt_labels[prev_out_ind_final] target['track_queries_mask'] = torch.cat([ track_queries_mask, torch.tensor([False, ] * self.head.num_queries).to(device) ]).bool() target['track_queries_fal_pos_mask'] = torch.cat([ track_queries_fal_pos_mask, torch.tensor([False, ] * self.head.num_queries).to(device) ]).bool() if use_memory: is_first_frame = (timestep == 0) num_tracks = 0 if timestep == 0 else self.tracked_query_length[b_i] self.memory_bank.update_memory(b_i, is_first_frame, prev_out_ind_final, prev_out, num_tracks, scene_seq_id, timestep) targets = self._batchify_tracks(targets) return targets def _batchify_tracks(self, targets): lengths = [len(t['track_queries_mask']) for t in targets] max_len = max(lengths) device = targets[0]['track_query_hs_embeds'].device for b_i in range(len(lengths)): target = targets[b_i] padding_len = max_len - lengths[b_i] pad_hs_embeds = torch.zeros([padding_len, target['track_query_hs_embeds'].shape[1]]).to(device) pad_query_boxes = torch.zeros([padding_len, target['track_query_boxes'].shape[1]]).to(device) query_padding_mask = torch.zeros([max_len]).bool().to(device) query_padding_mask[lengths[b_i]:] = True target['pad_hs_embeds'] = pad_hs_embeds target['pad_query_boxes'] = pad_query_boxes target['query_padding_mask'] = query_padding_mask self.tracked_query_length[b_i] = lengths[b_i] - self.head.num_queries return targets def train(self, *args, **kwargs): super().train(*args, **kwargs) if self.freeze_bev: self._freeze_bev() elif self.freeze_bev_iters is not None and self.num_iter < self.freeze_bev_iters: self._freeze_bev() else: self._unfreeze_bev() def eval(self): super().eval() def _freeze_bev(self,): """Freeze all bev-related backbone parameters, including the backbone and the seg head """ for param in self.backbone.parameters(): param.requires_grad = False for param in self.seg_decoder.parameters(): param.requires_grad = False def _unfreeze_bev(self,): """unfreeze all bev-related backbone parameters, including the backbone and the seg head """ for param in self.backbone.parameters(): param.requires_grad = True for param in self.seg_decoder.parameters(): param.requires_grad = True def _denorm_lines(self, line_pts): """from (0,1) to the BEV space in meters""" line_pts[..., 0] = line_pts[..., 0] * self.roi_size[0] \ - self.roi_size[0] / 2 line_pts[..., 1] = line_pts[..., 1] * self.roi_size[1] \ - self.roi_size[1] / 2 return line_pts def _norm_lines(self, line_pts): """from the BEV space in meters to (0,1) """ line_pts[..., 0] = (line_pts[..., 0] + self.roi_size[0] / 2) \ / self.roi_size[0] line_pts[..., 1] = (line_pts[..., 1] + self.roi_size[1] / 2) \ / self.roi_size[1] return line_pts def _process_track_query_info(self, track_info): bs = len(track_info) all_scores = [] all_lines = [] for b_i in range(bs): embeds = track_info[b_i]['track_query_hs_embeds'] scores = self.head.cls_branches[-1](embeds) coords = self.head.reg_branches[-1](embeds).sigmoid() coords = rearrange(coords, 'n1 (n2 n3) -> n1 n2 n3', n3=2) all_scores.append(scores) all_lines.append(coords) track_results = { 'lines': all_lines, 'scores': all_scores, } return track_results def select_memory_entries(self, history_metas, curr_meta): """ Only used at test time, to select a subset from the long history bank """ if len(history_metas) <= 
self.history_steps: return np.arange(len(history_metas)) else: history_e2g_trans = np.array([item[0]['ego2global_translation'] for item in history_metas])[:, :2] curr_e2g_trans = np.array(curr_meta[0]['ego2global_translation'])[:2] dists = np.linalg.norm(history_e2g_trans - curr_e2g_trans[None, :], axis=1) sorted_indices = np.argsort(dists) sorted_dists = dists[sorted_indices] covered = np.zeros_like(sorted_indices).astype(np.bool) selected_ids = [] for dist_range in self.mem_select_dist_ranges[::-1]: outter_valid_flags = (sorted_dists >= dist_range) & ~covered if outter_valid_flags.any(): pick_id = np.where(outter_valid_flags)[0][0] covered[pick_id:] = True else: inner_valid_flags = (sorted_dists < dist_range) & ~covered if inner_valid_flags.any(): pick_id = np.where(inner_valid_flags)[0][-1] covered[pick_id] = True else: return np.arange(len(history_metas))[-4:] selected_ids.append(pick_id) selected_mem_ids = sorted_indices[np.array(selected_ids)] return selected_mem_ids ##################################################################### # # Debugging visualization of the temporal propagation supervision # ##################################################################### def _viz_temporal_supervision(self, outputs_prev, all_track_info, gts, gts_prev, semantic_mask, semantic_mask_prev, img_metas, img_metas_prev, timestep): """For debugging use: draw the visualization of the track queries and the corresponding matched G.T. information...""" import os from ..utils.renderer_track import Renderer viz_dir = './viz/debug_noisy_trans' if not os.path.exists(viz_dir): os.makedirs(viz_dir) cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } renderer = Renderer(cat2id, self.roi_size, 'nusc') for b_i in range(len(all_track_info)): track_info = all_track_info[b_i] # prev pred info prev_pred_lines = outputs_prev['lines'][b_i] prev_pred_scores = outputs_prev['scores'][b_i] prev_target_inds = track_info['prev_target_ind'] prev_out_inds = track_info['prev_out_ind'] gt_prev2cur = track_info['gt_prev2cur'] prev_scores, prev_labels = prev_pred_scores.max(-1) prev_scores = prev_scores.sigmoid() prev_lines = rearrange(prev_pred_lines[prev_out_inds], 'n (k c) -> n k c', c=2) prev_labels = prev_labels[prev_out_inds] prev_lines = self._denorm_lines(prev_lines) prev_scores = prev_scores[prev_out_inds] out_path_prev = os.path.join(viz_dir, f't={timestep}_{b_i}_prev.png') renderer.render_bev_from_vectors(prev_lines, prev_labels, out_path_prev, id_info=prev_target_inds, score_info=prev_scores) # gt info gt_labels = gts['labels'][b_i] gt_lines = torch.clip(gts['lines'][b_i][:, 0], 0, 1) gt_lines = rearrange(gt_lines, 'n (k c) -> n k c', c=2) gt_lines = self._denorm_lines(gt_lines) out_path_gt = os.path.join(viz_dir, f't={timestep}_{b_i}_gt.png') gt_ids = np.arange(len(gt_lines)) renderer.render_bev_from_vectors(gt_lines, gt_labels, out_path_gt, id_info=gt_ids) gt_semantic = semantic_mask[b_i].cpu().numpy() out_path_gt_semantic = os.path.join(viz_dir, f't={timestep}_{b_i}_gt_semantic.png') renderer.render_bev_from_mask(gt_semantic, out_path_gt_semantic) # gt info for prev frame gt_labels = gts_prev['labels'][b_i] gt_lines = torch.clip(gts_prev['lines'][b_i][:, 0], 0, 1) gt_lines = rearrange(gt_lines, 'n (k c) -> n k c', c=2) gt_lines = self._denorm_lines(gt_lines) out_path_gt = os.path.join(viz_dir, f't={timestep}_{b_i}_prev_gt.png') gt_ids = np.arange(len(gt_lines)) renderer.render_bev_from_vectors(gt_lines, gt_labels, out_path_gt, id_info=gt_ids) gt_semantic = semantic_mask_prev[b_i].cpu().numpy() 
out_path_gt_semantic = os.path.join(viz_dir, f't={timestep}_{b_i}_prev_gt_semantic.png') renderer.render_bev_from_mask(gt_semantic, out_path_gt_semantic) # track query info track_to_cur_gt_ids = track_info['track_to_cur_gt_ids'] trans_track_lines = track_info['trans_track_query_boxes'] trans_track_lines = rearrange(trans_track_lines, 'n (k c) -> n k c', c=2) trans_track_lines = self._denorm_lines(trans_track_lines) #tp_track_mask = ~track_info['track_queries_fal_pos_mask'][:-100] trans_track_lines = trans_track_lines track_labels = track_info['track_query_labels'] track_scores = track_info['track_query_scores'] out_path_track = os.path.join(viz_dir, f't={timestep}_{b_i}_track.png') renderer.render_bev_from_vectors(trans_track_lines, track_labels, out_path_track, id_info=track_to_cur_gt_ids, score_info=track_scores) ================================================ FILE: plugin/models/mapers/__init__.py ================================================ from .MapTracker import MapTracker ================================================ FILE: plugin/models/mapers/base_mapper.py ================================================ from abc import ABCMeta, abstractmethod import torch.nn as nn from mmcv.runner import auto_fp16 from mmcv.utils import print_log from mmdet.utils import get_root_logger from mmdet3d.models.builder import DETECTORS MAPPERS = DETECTORS class BaseMapper(nn.Module, metaclass=ABCMeta): """Base class for mappers.""" def __init__(self): super(BaseMapper, self).__init__() self.fp16_enabled = False @property def with_neck(self): """bool: whether the detector has a neck""" return hasattr(self, 'neck') and self.neck is not None # TODO: these properties need to be carefully handled # for both single stage & two stage detectors @property def with_shared_head(self): """bool: whether the detector has a shared head in the RoI Head""" return hasattr(self, 'roi_head') and self.roi_head.with_shared_head @property def with_bbox(self): """bool: whether the detector has a bbox head""" return ((hasattr(self, 'roi_head') and self.roi_head.with_bbox) or (hasattr(self, 'bbox_head') and self.bbox_head is not None)) @property def with_mask(self): """bool: whether the detector has a mask head""" return ((hasattr(self, 'roi_head') and self.roi_head.with_mask) or (hasattr(self, 'mask_head') and self.mask_head is not None)) #@abstractmethod def extract_feat(self, imgs): """Extract features from images.""" pass def forward_train(self, *args, **kwargs): pass #@abstractmethod def simple_test(self, img, img_metas, **kwargs): pass #@abstractmethod def aug_test(self, imgs, img_metas, **kwargs): """Test function with test time augmentation.""" pass def init_weights(self, pretrained=None): """Initialize the weights in detector. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ if pretrained is not None: logger = get_root_logger() print_log(f'load model from: {pretrained}', logger=logger) def forward_test(self, *args, **kwargs): """ Args: """ if True: self.simple_test() else: self.aug_test() # @auto_fp16(apply_to=('img', )) def forward(self, *args, return_loss=True, **kwargs): """Calls either :func:`forward_train` or :func:`forward_test` depending on whether ``return_loss`` is ``True``. Note this setting will change the expected inputs. When ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor and List[dict]), and when ``resturn_loss=False``, img and img_meta should be double nested (i.e. 
List[Tensor], List[List[dict]]), with the outer list indicating test time augmentations. """ if return_loss: return self.forward_train(*args, **kwargs) else: kwargs.pop('rescale') return self.forward_test(*args, **kwargs) def train_step(self, data_dict, optimizer): """The iteration step during training. This method defines an iteration step during training, except for the back propagation and optimizer updating, which are done in an optimizer hook. Note that in some complicated cases or models, the whole process including back propagation and optimizer updating is also defined in this method, such as GAN. Args: data_dict (dict): The output of dataloader. optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of runner is passed to ``train_step()``. This argument is unused and reserved. Returns: dict: It should contain at least 3 keys: ``loss``, ``log_vars``, \ ``num_samples``. - ``loss`` is a tensor for back propagation, which can be a \ weighted sum of multiple losses. - ``log_vars`` contains all the variables to be sent to the logger. - ``num_samples`` indicates the batch size (when the model is \ DDP, it means the batch size on each GPU), which is used for \ averaging the logs. """ loss, log_vars, num_samples = self(**data_dict) outputs = dict( loss=loss, log_vars=log_vars, num_samples=num_samples) return outputs def val_step(self, data, optimizer): """The iteration step during validation. This method shares the same signature as :func:`train_step`, but used during val epochs. Note that the evaluation after training epochs is not implemented with this method, but an evaluation hook. """ loss, log_vars, num_samples = self(**data) outputs = dict( loss=loss, log_vars=log_vars, num_samples=num_samples) return outputs def show_result(self, **kwargs): img = None return img ================================================ FILE: plugin/models/mapers/vector_memory.py ================================================ import torch from torch import nn from einops import repeat, rearrange from scipy.spatial.transform import Rotation as R import numpy as np def get_emb(sin_inp): """ Gets a base embedding for one dimension with sin and cos intertwined """ emb = torch.stack((sin_inp.sin(), sin_inp.cos()), dim=-1) return torch.flatten(emb, -2, -1) class PositionalEncoding1D(nn.Module): def __init__(self, channels): """ :param channels: The last dimension of the tensor you want to apply pos emb to. 
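            ``channels`` is rounded up to the nearest even number internally so
            that the sine and cosine components can be interleaved; the encoding
            returned by :func:`forward` is truncated back to the original
            channel count.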
""" super(PositionalEncoding1D, self).__init__() self.org_channels = channels channels = int(np.ceil(channels / 2) * 2) self.channels = channels inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels)) self.register_buffer("inv_freq", inv_freq) self.register_buffer("cached_penc", None) def forward(self, tensor): """ :param tensor: A 3d tensor of size (batch_size, x, ch) :return: Positional Encoding Matrix of size (batch_size, x, ch) """ if len(tensor.shape) != 3: raise RuntimeError("The input tensor has to be 3d!") if self.cached_penc is not None and self.cached_penc.shape == tensor.shape: return self.cached_penc self.cached_penc = None batch_size, x, orig_ch = tensor.shape pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type()) sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq) emb_x = get_emb(sin_inp_x) emb = torch.zeros((x, self.channels), device=tensor.device).type(tensor.type()) emb[:, : self.channels] = emb_x self.cached_penc = emb[None, :, :orig_ch].repeat(batch_size, 1, 1) return self.cached_penc class VectorInstanceMemory(nn.Module): def __init__(self, dim_in, number_ins, bank_size, mem_len, mem_select_dist_ranges ): super().__init__() self.max_number_ins = 3 * number_ins # make sure this is not exceeded at initial training when results could be quite random self.bank_size = bank_size self.mem_len = mem_len self.dim_in = dim_in self.mem_select_dist_ranges = mem_select_dist_ranges p_enc_1d = PositionalEncoding1D(dim_in) fake_tensor = torch.zeros((1, 1000, dim_in)) # suppose all sequences are shorter than 1000 self.cached_pe = p_enc_1d(fake_tensor)[0] for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def set_bank_size(self, bank_size): self.bank_size = bank_size def init_memory(self, bs): self.mem_bank = torch.zeros((self.bank_size, bs, self.max_number_ins, self.dim_in), dtype=torch.float32).cuda() self.mem_bank_seq_id = torch.zeros((self.bank_size, bs, self.max_number_ins), dtype=torch.long).cuda() self.mem_bank_trans = torch.zeros((self.bank_size, bs, 3),dtype=torch.float32).cuda() self.mem_bank_rot = torch.zeros((self.bank_size, bs, 3, 3),dtype=torch.float32).cuda() self.batch_mem_embeds_dict = {} self.batch_mem_relative_pe_dict = {} self.batch_key_padding_dict = {} self.curr_rot = torch.zeros((bs,3,3),dtype=torch.float32).cuda() self.curr_trans = torch.zeros((bs,3),dtype=torch.float32).cuda() self.gt_lines_info = {} # memory recording information self.instance2mem = [{} for _ in range(bs)] self.num_ins = [0 for _ in range(bs)] self.active_mem_ids = [None for _ in range(bs)] self.valid_track_idx = [None for _ in range(bs)] self.random_bev_masks = [None for _ in range(bs)] init_entry_length = torch.tensor([0]*self.max_number_ins).long() self.mem_entry_lengths = [init_entry_length.clone() for _ in range(bs)] def update_memory(self, batch_i, is_first_frame, propagated_ids, prev_out, num_tracks, seq_idx, timestep): if is_first_frame: mem_instance_ids = torch.arange(propagated_ids.shape[0]) track2mem_info = {i: i for i in range(len(propagated_ids))} num_instances = len(propagated_ids) else: track2mem_info_prev = self.instance2mem[batch_i] track2mem_info = {} num_instances = self.num_ins[batch_i] for pred_i, propagated_id in enumerate(propagated_ids): if propagated_id < num_tracks: # existing tracks track2mem_info[pred_i] = track2mem_info_prev[propagated_id.item()] else: # newborn instances track2mem_info[pred_i] = num_instances num_instances += 1 mem_instance_ids = torch.tensor([track2mem_info[item] for item in 
range(len(propagated_ids))]).long() assert num_instances < self.max_number_ins, 'Number of instances larger than mem size!' #NOTE: put information into the memory, need to detach the scores to block gradient backprop # from future time steps prev_embeddings = prev_out['hs_embeds'][batch_i] prev_scores = prev_out['scores'][batch_i] prev_scores, prev_labels = prev_scores.max(-1) prev_scores = prev_scores.sigmoid().detach() mem_lens_per_ins = self.mem_entry_lengths[batch_i][mem_instance_ids] # insert information into mem bank for ins_idx, mem_id in enumerate(mem_instance_ids): if mem_lens_per_ins[ins_idx] < self.bank_size: self.mem_bank[mem_lens_per_ins[ins_idx], batch_i, mem_id] = prev_embeddings[propagated_ids[ins_idx]] self.mem_bank_seq_id[mem_lens_per_ins[ins_idx], batch_i, mem_id] = seq_idx else: self.mem_bank[:self.bank_size-1, batch_i, mem_id] = self.mem_bank[1:self.bank_size, batch_i, mem_id] self.mem_bank[-1, batch_i, mem_id] = prev_embeddings[propagated_ids[ins_idx]] self.mem_bank_seq_id[:self.bank_size-1, batch_i, mem_id] = self.mem_bank_seq_id[1:self.bank_size, batch_i, mem_id] self.mem_bank_seq_id[-1, batch_i, mem_id] = seq_idx if self.curr_t < self.bank_size: self.mem_bank_rot[self.curr_t, batch_i] = self.curr_rot[batch_i] self.mem_bank_trans[self.curr_t, batch_i] = self.curr_trans[batch_i] else: self.mem_bank_rot[:self.bank_size-1, batch_i] = self.mem_bank_rot[1:, batch_i].clone() self.mem_bank_rot[-1, batch_i] = self.curr_rot[batch_i] self.mem_bank_trans[:self.bank_size-1, batch_i] = self.mem_bank_trans[1:, batch_i].clone() self.mem_bank_trans[-1, batch_i] = self.curr_trans[batch_i] # Update the mem recording information self.instance2mem[batch_i] = track2mem_info self.num_ins[batch_i] = num_instances self.mem_entry_lengths[batch_i][mem_instance_ids] += 1 self.active_mem_ids[batch_i] = mem_instance_ids.long().to(propagated_ids.device) active_mem_entry_lens = self.mem_entry_lengths[batch_i][self.active_mem_ids[batch_i]] self.valid_track_idx[batch_i] = torch.where(active_mem_entry_lens >= 1)[0] #print('Active memory ids:', self.active_mem_ids[batch_i]) #print('Memory entry lens:', active_mem_entry_lens) #print('Valid track idx:', self.valid_track_idx[batch_i]) def prepare_transformation_batch(self,history_e2g_trans,history_e2g_rot,curr_e2g_trans,curr_e2g_rot): history_g2e_matrix = torch.stack([torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device),]*len(history_e2g_trans), dim=0) history_g2e_matrix[:, :3, :3] = torch.transpose(history_e2g_rot, 1, 2) history_g2e_matrix[:, :3, 3] = -torch.bmm(torch.transpose(history_e2g_rot, 1, 2), history_e2g_trans[..., None]).squeeze(-1) curr_g2e_matrix = torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) curr_e2g_matrix = torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device) curr_e2g_matrix[:3, :3] = curr_e2g_rot curr_e2g_matrix[:3, 3] = curr_e2g_trans history_e2g_matrix = torch.stack([torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device),]*len(history_e2g_trans), dim=0) history_e2g_matrix[:, :3, :3] = history_e2g_rot history_e2g_matrix[:, :3, 3] = history_e2g_trans history_curr2prev_matrix = torch.bmm(history_g2e_matrix, repeat(curr_e2g_matrix,'n1 n2 -> r n1 n2', r=len(history_g2e_matrix))) history_prev2curr_matrix = torch.bmm(repeat(curr_g2e_matrix, 'n1 n2 -> r n1 n2', r=len(history_e2g_matrix)), history_e2g_matrix) return history_curr2prev_matrix, history_prev2curr_matrix def 
clear_dict(self,): self.batch_mem_embeds_dict = {} self.batch_mem_relative_pe_dict = {} self.batch_key_padding_dict = {} def trans_memory_bank(self, query_prop, b_i, metas): seq_id = metas['local_idx'] active_mem_ids = self.active_mem_ids[b_i] mem_entry_lens = self.mem_entry_lengths[b_i][active_mem_ids] num_track_ins = len(active_mem_ids) valid_mem_len = min(self.curr_t, self.mem_len) valid_bank_size = min(self.curr_t, self.bank_size) mem_trans = self.mem_bank_trans[:, b_i] mem_rots = self.mem_bank_rot[:, b_i] if self.training: # Note: at training time, bank_size must be the same as mem_len, no selection needed assert self.mem_len == self.bank_size, 'at training time, bank_size must be the same as mem_len' mem_embeds = self.mem_bank[:, b_i, active_mem_ids] mem_seq_ids = self.mem_bank_seq_id[:, b_i, active_mem_ids] else: # at test time, the bank size can be much longer, and we need the selection strategy mem_embeds = torch.zeros_like(self.mem_bank[:self.mem_len, b_i, active_mem_ids]) mem_seq_ids = torch.zeros_like(self.mem_bank_seq_id[:self.mem_len, b_i, active_mem_ids]) # Put information into mem embeddings and pos_ids, prepare for attention-fusion # Also prepare the pose information for the query propagation all_pose_select_indices = [] all_select_indices = [] for idx, active_idx in enumerate(active_mem_ids): effective_len = mem_entry_lens[idx] valid_mem_trans = mem_trans[:valid_bank_size] trunc_eff_len = min(effective_len, self.bank_size) valid_pose_ids = torch.arange(valid_bank_size-trunc_eff_len, valid_bank_size) #print('ins {}, valid pose ids {}'.format(idx, valid_pose_ids)) if effective_len <= self.mem_len: select_indices = torch.arange(effective_len) else: select_indices = self.select_memory_entries(valid_mem_trans[-trunc_eff_len:], metas) pose_select_indices = valid_pose_ids[select_indices] mem_embeds[:len(select_indices), idx] = self.mem_bank[select_indices, b_i, active_idx] mem_seq_ids[:len(select_indices), idx] = self.mem_bank_seq_id[select_indices, b_i, active_idx] all_pose_select_indices.append(pose_select_indices) all_select_indices.append(select_indices) # prepare mem padding mask key_padding_mask = torch.ones((self.mem_len, num_track_ins)).bool().cuda() padding_trunc_loc = torch.clip(mem_entry_lens, max=self.mem_len) for ins_i in range(num_track_ins): key_padding_mask[:padding_trunc_loc[ins_i], ins_i] = False key_padding_mask = key_padding_mask.T # prepare relative seq idx gap relative_seq_idx = torch.zeros_like(mem_embeds[:,:,0]).long() relative_seq_idx[:valid_mem_len] = seq_id - mem_seq_ids[:valid_mem_len] relative_seq_pe = self.cached_pe[relative_seq_idx].to(mem_embeds.device) # prepare relative pose information for each active instance curr2prev_matrix, prev2curr_matrix = self.prepare_transformation_batch(mem_trans[:valid_bank_size], mem_rots[:valid_bank_size], self.curr_trans[b_i], self.curr_rot[b_i]) pose_matrix = prev2curr_matrix.float()[:,:3] rot_mat = pose_matrix[..., :3].cpu().numpy() rot = R.from_matrix(rot_mat) translation = pose_matrix[..., 3] if self.training: rot, translation = self.add_noise_to_pose(rot, translation) rot_quat = torch.tensor(rot.as_quat()).float().to(pose_matrix.device) pose_info = torch.cat([rot_quat, translation], dim=1) pose_info_per_ins = torch.zeros((valid_mem_len, num_track_ins, pose_info.shape[1])).to(pose_info.device) for ins_idx in range(num_track_ins): pose_select_indices = all_pose_select_indices[ins_idx] pose_info_per_ins[:len(pose_select_indices), ins_idx] = pose_info[pose_select_indices] mem_embeds_new = mem_embeds.clone() 
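        # Propagate every valid memory slot to the current frame: each stored
        # embedding is passed through the pose-conditioned query_prop module
        # together with the relative pose (quaternion + translation, optionally
        # noised during training) from the frame it was recorded at to the
        # current ego pose, then reshaped back to (time, instance, channel).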
mem_embeds_valid = rearrange(mem_embeds[:valid_mem_len], 't n c -> (t n) c') pose_info_per_ins = rearrange(pose_info_per_ins, 't n c -> (t n) c') mem_embeds_prop = query_prop( mem_embeds_valid, pose_info_per_ins ) mem_embeds_new[:valid_mem_len] = rearrange(mem_embeds_prop, '(t n) c -> t n c', t=valid_mem_len) self.batch_mem_embeds_dict[b_i] = mem_embeds_new.clone().detach() self.batch_mem_relative_pe_dict[b_i] = relative_seq_pe self.batch_key_padding_dict[b_i] = key_padding_mask def add_noise_to_pose(self, rot, trans): rot_euler = rot.as_euler('zxy') # 0.08 mean is around 5-degree, 3-sigma is 15-degree noise_euler = np.random.randn(*list(rot_euler.shape)) * 0.08 rot_euler += noise_euler noisy_rot = R.from_euler('zxy', rot_euler) # error within 0.25 meter noise_trans = torch.randn_like(trans) * 0.25 noise_trans[:, 2] = 0 noisy_trans = trans + noise_trans return noisy_rot, noisy_trans def select_memory_entries(self, mem_trans, curr_meta): history_e2g_trans = mem_trans[:, :2].cpu().numpy() curr_e2g_trans = np.array(curr_meta['ego2global_translation'][:2]) dists = np.linalg.norm(history_e2g_trans - curr_e2g_trans[None, :], axis=1) sorted_indices = np.argsort(dists) sorted_dists = dists[sorted_indices] covered = np.zeros_like(sorted_indices).astype(np.bool) selected_ids = [] for dist_range in self.mem_select_dist_ranges[::-1]: outter_valid_flags = (sorted_dists >= dist_range) & ~covered if outter_valid_flags.any(): pick_id = np.where(outter_valid_flags)[0][0] covered[pick_id:] = True else: inner_valid_flags = (sorted_dists < dist_range) & ~covered if inner_valid_flags.any(): pick_id = np.where(inner_valid_flags)[0][-1] covered[pick_id] = True else: # return the mem_len closest one, but in the order of far -> close return np.array(sorted_indices[:4][::-1]) selected_ids.append(pick_id) selected_mem_ids = sorted_indices[np.array(selected_ids)] return selected_mem_ids ================================================ FILE: plugin/models/necks/__init__.py ================================================ from .gru import ConvGRU ================================================ FILE: plugin/models/necks/gru.py ================================================ import torch import torch.nn as nn from mmdet.models import NECKS from mmcv.cnn.utils import kaiming_init, constant_init @NECKS.register_module() class ConvGRU(nn.Module): def __init__(self, out_channels): super(ConvGRU, self).__init__() kernel_size = 1 padding = kernel_size // 2 self.convz = nn.Conv2d(2*out_channels, out_channels, kernel_size=kernel_size, padding=padding, bias=False) self.convr = nn.Conv2d(2*out_channels, out_channels, kernel_size=kernel_size, padding=padding, bias=False) self.convq = nn.Conv2d(2*out_channels, out_channels, kernel_size=kernel_size, padding=padding, bias=False) self.ln = nn.LayerNorm(out_channels) self.zero_out = nn.Conv2d(out_channels, out_channels, 1, 1, bias=True) def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) nn.init.zeros_(self.zero_out.weight) nn.init.zeros_(self.zero_out.bias) def forward(self, h, x): if len(h.shape) == 3: h = h.unsqueeze(0) if len(x.shape) == 3: x = x.unsqueeze(0) hx = torch.cat([h, x], dim=1) # [1, 2c, h, w] z = torch.sigmoid(self.convz(hx)) r = torch.sigmoid(self.convr(hx)) new_x = torch.cat([r * h, x], dim=1) # [1, 2c, h, w] q = self.convq(new_x) out = ((1 - z) * h + z * q) # (1, C, H, W) out = self.ln(out.permute(0, 2, 3, 1)).permute(0, 3, 1, 2).contiguous() out = self.zero_out(out) out = out + x out = out.squeeze(0) return out 
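# A minimal shape-check sketch for ConvGRU (illustrative only; the channel
# count and BEV resolution below are assumptions rather than values taken
# from the configs). The GRU fuses a warped previous BEV state `h` with the
# current BEV feature `x` of the same shape and returns a fused map.
if __name__ == '__main__':
    gru = ConvGRU(out_channels=256)
    gru.init_weights()
    h = torch.randn(256, 100, 50)   # previous / warped BEV state (C, H, W)
    x = torch.randn(256, 100, 50)   # current-frame BEV feature (C, H, W)
    fused = gru(h, x)
    assert fused.shape == (256, 100, 50)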
================================================
FILE: plugin/models/transformer_utils/CustomMSDeformableAttention.py
================================================
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------

from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import mmcv
import cv2 as cv
import copy
import warnings
from matplotlib import pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import (ATTENTION,
                                      TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import TransformerLayerSequence
import math
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
                        to_2tuple)
from mmcv.utils import ext_loader
from mmcv.ops.multi_scale_deform_attn import (MultiScaleDeformableAttnFunction,
                                              multi_scale_deformable_attn_pytorch)
from .fp16_dattn import MultiScaleDeformableAttnFunctionFp32


@ATTENTION.register_module()
class CustomMSDeformableAttention(BaseModule):
    """An attention module used in Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
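        use_sampling_offsets (bool): Whether to predict learnable sampling
            offsets around the reference points; if False, the attention
            samples at the reference points directly. Default: True.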
""" def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, im2col_step=64, dropout=0.1, use_sampling_offsets=True, batch_first=False, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first self.fp16_enabled = False # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.use_sampling_offsets = use_sampling_offsets if use_sampling_offsets: self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" if self.use_sampling_offsets: constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True @deprecated_api_warning({'residual': 'identity'}, cls_name='MultiScaleDeformableAttention') def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, flag='decoder', **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, num_points, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. 
With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) if self.use_sampling_offsets: sampling_offsets = self.sampling_offsets(query).view( bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) else: sampling_offsets = query.new_zeros((bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_levels * self.num_points) attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) # TODO: try remove sampling offsets offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) # changed to (h, w) _, _, num_points, _ = reference_points.shape # (bs, num_queries, num_pts, 2) -> # (bs, num_queries, num_heads, num_lvls, num_pts, 2) reference_points = reference_points[:, :, None, None, :, :] # reference_points[..., 1:2] = -reference_points[..., 1:2] sampling_locations = reference_points + \ (sampling_offsets # (bs, num_queries, num_heads, num_lvls, num_pts, 2) / offset_normalizer[None, None, None, :, None, :]) assert list(sampling_locations.shape) == [bs, num_query, self.num_heads, self.num_levels, num_points, 2] if torch.cuda.is_available() and value.is_cuda: # using fp16 deformable attention is unstable because it performs many sum operations output = MultiScaleDeformableAttnFunctionFp32.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) output = self.output_proj(output) if not self.batch_first: # (num_query, bs ,embed_dims) output = output.permute(1, 0, 2) return self.dropout(output) + identity ================================================ FILE: plugin/models/transformer_utils/MapTransformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
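# Transformer components for the MapTracker vector decoder:
# MapTransformerDecoder_new iteratively refines per-query polyline reference
# points across layers, MapTransformerLayer augments the standard DETR layer
# with a second cross-attention into the vector instance memory bank, and
# MapTransformer flattens the BEV features and runs the decoder over them.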
import math import warnings import copy import torch import torch.nn as nn from mmcv.cnn import build_activation_layer, build_norm_layer, xavier_init from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, TransformerLayerSequence, build_transformer_layer) from mmcv.runner.base_module import BaseModule, ModuleList from mmdet.models.utils.builder import TRANSFORMER from mmdet.models.utils.transformer import Transformer from .CustomMSDeformableAttention import CustomMSDeformableAttention from mmdet.models.utils.transformer import inverse_sigmoid @TRANSFORMER_LAYER_SEQUENCE.register_module() class MapTransformerDecoder_new(BaseModule): """Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, transformerlayers=None, num_layers=None, prop_add_stage=0, return_intermediate=True, init_cfg=None): super().__init__(init_cfg) if isinstance(transformerlayers, dict): transformerlayers = [ copy.deepcopy(transformerlayers) for _ in range(num_layers) ] else: assert isinstance(transformerlayers, list) and \ len(transformerlayers) == num_layers self.num_layers = num_layers self.layers = ModuleList() for i in range(num_layers): self.layers.append(build_transformer_layer(transformerlayers[i])) self.embed_dims = self.layers[0].embed_dims self.pre_norm = self.layers[0].pre_norm self.return_intermediate = return_intermediate self.prop_add_stage = prop_add_stage assert prop_add_stage >= 0 and prop_add_stage < num_layers def forward(self, query, key, value, query_pos, key_padding_mask, query_key_padding_mask, reference_points, spatial_shapes, level_start_index, reg_branches, cls_branches, predict_refine, memory_bank=None, **kwargs): """Forward function for `TransformerDecoder`. Args: query (Tensor): Input query with shape `(num_query, bs, embed_dims)`. reference_points (Tensor): The reference points of offset. has shape (bs, num_query, num_points, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) reg_branch: (obj:`nn.ModuleList`): Used for refining the regression results. Only would be passed when with_box_refine is True, otherwise would be passed a `None`. Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. 
""" num_queries, bs, embed_dims = query.shape output = query intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): tmp = reference_points.clone() tmp[..., 1:2] = 1.0 - reference_points[..., 1:2] # reverse y-axis output = layer( output, key, value, query_pos=query_pos, key_padding_mask=key_padding_mask, reference_points=tmp, spatial_shapes=spatial_shapes, level_start_index=level_start_index, query_key_padding_mask=query_key_padding_mask, memory_bank=memory_bank, **kwargs) reg_points = reg_branches[lid](output.permute(1, 0, 2)) # (bs, num_q, 2*num_points) bs, num_queries, num_points2 = reg_points.shape reg_points = reg_points.view(bs, num_queries, num_points2//2, 2) # range (0, 1) if predict_refine: new_reference_points = reg_points + inverse_sigmoid( reference_points ) new_reference_points = new_reference_points.sigmoid() else: new_reference_points = reg_points.sigmoid() # (bs, num_q, num_points, 2) reference_points = new_reference_points.clone().detach() if self.return_intermediate: intermediate.append(output.permute(1, 0, 2)) # [(bs, num_q, embed_dims)] intermediate_reference_points.append(new_reference_points) # (bs, num_q, num_points, 2) if self.return_intermediate: return intermediate, intermediate_reference_points return output, reference_points @TRANSFORMER_LAYER.register_module() class MapTransformerLayer(BaseTransformerLayer): """Base `TransformerLayer` for vision transformer. It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. 
""" def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=False, **kwargs): super().__init__( attn_cfgs=attn_cfgs, ffn_cfgs=ffn_cfgs, operation_order=operation_order, norm_cfg=norm_cfg, init_cfg=init_cfg, batch_first=batch_first, **kwargs ) def forward(self, query, key=None, value=None, memory_query=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, memory_bank=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. """ if memory_bank is not None: bs = query.shape[1] all_valid_track_idx = [] for b_i in range(bs): all_valid_track_idx.append(memory_bank.valid_track_idx[b_i]) norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': if memory_query is None: temp_key = temp_value = query else: temp_key = temp_value = torch.cat([memory_query, query], dim=0) query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': if attn_index == 1: query_bev = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 else: # Memory cross attention assert attn_index == 2 if memory_bank is not None: bs = query.shape[1] query_i_list = [] for b_i in range(bs): valid_track_idx = all_valid_track_idx[b_i] query_i = query[:, b_i].clone() query_i = query_i[None,:] if len(valid_track_idx) != 0: mem_embeds = memory_bank.batch_mem_embeds_dict[b_i][:, valid_track_idx, :] 
mem_key_padding_mask = memory_bank.batch_key_padding_dict[b_i][valid_track_idx] mem_key_pos = memory_bank.batch_mem_relative_pe_dict[b_i][:, valid_track_idx] query_i[:, valid_track_idx] = self.attentions[attn_index]( query_i[:,valid_track_idx], mem_embeds, mem_embeds, identity=None, query_pos=None, key_pos=mem_key_pos, attn_mask=None, key_padding_mask=mem_key_padding_mask, **kwargs) query_i_list.append(query_i[0]) query_memory = torch.stack(query_i_list).permute(1, 0, 2) else: query_memory = torch.zeros_like(query_bev) query = query_memory + query_bev identity = query attn_index += 1 elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query @TRANSFORMER.register_module() class MapTransformer(Transformer): """Implements the DeformableDETR transformer. Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ def __init__(self, num_feature_levels=1, num_points=20, coord_dim=2, **kwargs): super().__init__(**kwargs) self.num_feature_levels = num_feature_levels self.embed_dims = self.encoder.embed_dims self.coord_dim = coord_dim self.num_points = num_points self.init_layers() def init_layers(self): """Initialize layers of the DeformableDetrTransformer.""" # self.level_embeds = nn.Parameter( # torch.Tensor(self.num_feature_levels, self.embed_dims)) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, CustomMSDeformableAttention): m.init_weights() def forward(self, mlvl_feats, mlvl_masks, query_embed, mlvl_pos_embeds, init_reference_points, reg_branches=None, cls_branches=None, memory_query=None, memory_bank=None, **kwargs): """Forward function for `Transformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, embed_dims, h, w]. mlvl_masks (list(Tensor)): The key_padding_mask from different level used for encoder and decoder, each element has shape [bs, h, w]. query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. mlvl_pos_embeds (list(Tensor)): The positional encoding of feats from different level, has the shape [bs, embed_dims, h, w]. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is True. Default to None. cls_branches (obj:`nn.ModuleList`): Classification heads for feature maps from each decoder layer. Only would be passed when `as_two_stage` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If return_intermediate_dec is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of \ proposals generated from \ encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. 
- enc_outputs_coord_unact: The regression results \ generated from encoder's feature maps., has shape \ (batch, h*w, 4). Only would \ be returned when `as_two_stage` is True, \ otherwise None. """ feat_flatten = [] mask_flatten = [] # lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (feat, mask, pos_embed) in enumerate( zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): bs, c, h, w = feat.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) feat = feat.flatten(2).transpose(1, 2) mask = mask.flatten(1) feat_flatten.append(feat) mask_flatten.append(mask) feat_flatten = torch.cat(feat_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) spatial_shapes = torch.as_tensor( spatial_shapes, dtype=torch.long, device=feat_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros( (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) # decoder query = query_embed.permute(1, 0, 2) # (num_q, bs, embed_dims) if memory_query is not None: memory_query = memory_query.permute(1, 0, 2) inter_states, inter_references = self.decoder( query=query, key=None, value=feat_flatten, query_pos=None, key_padding_mask=mask_flatten, reference_points=init_reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, reg_branches=reg_branches, cls_branches=cls_branches, memory_query=memory_query, memory_bank=memory_bank, **kwargs) return inter_states, init_reference_points, inter_references ================================================ FILE: plugin/models/transformer_utils/__init__.py ================================================ from .deformable_transformer import DeformableDetrTransformer_, DeformableDetrTransformerDecoder_ from .base_transformer import PlaceHolderEncoder from .CustomMSDeformableAttention import CustomMSDeformableAttention from .MapTransformer import MapTransformer, MapTransformerDecoder_new, MapTransformerLayer ================================================ FILE: plugin/models/transformer_utils/base_transformer.py ================================================ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init, constant_init from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import (MultiScaleDeformableAttention, TransformerLayerSequence, build_transformer_layer_sequence) from mmcv.runner.base_module import BaseModule from mmdet.models.utils.builder import TRANSFORMER @TRANSFORMER_LAYER_SEQUENCE.register_module() class PlaceHolderEncoder(nn.Module): def __init__(self, *args, embed_dims=None, **kwargs): super(PlaceHolderEncoder, self).__init__() self.embed_dims = embed_dims def forward(self, *args, query=None, **kwargs): return query ================================================ FILE: plugin/models/transformer_utils/deformable_transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
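# Deformable-DETR transformer utilities adapted from mmdet. In this plugin the
# encoder stage is bypassed (the call is commented out in `forward` below) and
# the flattened BEV features are fed to the decoder directly as the values.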
import math import warnings import torch import torch.nn as nn from mmcv.cnn import build_activation_layer, build_norm_layer, xavier_init from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, TransformerLayerSequence, build_transformer_layer_sequence) from mmcv.runner.base_module import BaseModule from torch.nn.init import normal_ from mmdet.models.utils.builder import TRANSFORMER from mmdet.models.utils.transformer import Transformer try: from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention except ImportError: warnings.warn( '`MultiScaleDeformableAttention` in MMCV has been moved to ' '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV') from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention from .fp16_dattn import MultiScaleDeformableAttentionFp16 def inverse_sigmoid(x, eps=1e-5): """Inverse function of sigmoid. Args: x (Tensor): The tensor to do the inverse. eps (float): EPS avoid numerical overflow. Defaults 1e-5. Returns: Tensor: The x has passed the inverse function of sigmoid, has same shape with input. """ x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) return torch.log(x1 / x2) @TRANSFORMER_LAYER_SEQUENCE.register_module() class DeformableDetrTransformerDecoder_(TransformerLayerSequence): """Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, *args, return_intermediate=False, coord_dim=2, kp_coord_dim=2, **kwargs): super(DeformableDetrTransformerDecoder_, self).__init__(*args, **kwargs) self.return_intermediate = return_intermediate self.coord_dim = coord_dim self.kp_coord_dim = kp_coord_dim def forward(self, query, *args, reference_points=None, valid_ratios=None, reg_branches=None, **kwargs): """Forward function for `TransformerDecoder`. Args: query (Tensor): Input query with shape `(num_query, bs, embed_dims)`. reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, otherwise has shape ((bs, num_query, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) reg_branch: (obj:`nn.ModuleList`): Used for refining the regression results. Only would be passed when with_box_refine is True, otherwise would be passed a `None`. Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. 
""" output = query intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): reference_points_input = \ reference_points[:, :, None,:self.kp_coord_dim] * \ valid_ratios[:, None,:,:self.kp_coord_dim] # if reference_points.shape[-1] == 3 and self.kp_coord_dim==2: output = layer( output, *args, reference_points=reference_points_input[...,:self.kp_coord_dim], **kwargs) output = output.permute(1, 0, 2) if reg_branches is not None: tmp = reg_branches[lid](output) new_reference_points = tmp new_reference_points[..., :self.kp_coord_dim] = tmp[ ..., :self.kp_coord_dim] + inverse_sigmoid(reference_points) new_reference_points = new_reference_points.sigmoid() if reference_points.shape[-1] == 3 and self.kp_coord_dim==2: reference_points[...,-1] = tmp[...,-1].sigmoid().detach() reference_points[...,:self.coord_dim] = new_reference_points.detach() output = output.permute(1, 0, 2) if self.return_intermediate: intermediate.append(output) intermediate_reference_points.append(reference_points) if self.return_intermediate: return torch.stack(intermediate), torch.stack( intermediate_reference_points) return output, reference_points @TRANSFORMER.register_module() class DeformableDetrTransformer_(Transformer): """Implements the DeformableDETR transformer. Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ def __init__(self, as_two_stage=False, num_feature_levels=1, two_stage_num_proposals=300, coord_dim=2, **kwargs): super(DeformableDetrTransformer_, self).__init__(**kwargs) self.as_two_stage = as_two_stage self.num_feature_levels = num_feature_levels self.two_stage_num_proposals = two_stage_num_proposals self.embed_dims = self.encoder.embed_dims self.coord_dim = coord_dim self.init_layers() def init_layers(self): """Initialize layers of the DeformableDetrTransformer.""" self.level_embeds = nn.Parameter( torch.Tensor(self.num_feature_levels, self.embed_dims)) if self.as_two_stage: self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) self.enc_output_norm = nn.LayerNorm(self.embed_dims) self.pos_trans = nn.Linear(self.embed_dims * 2, self.embed_dims * 2) self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) else: self.reference_points_embed = nn.Linear(self.embed_dims, self.coord_dim) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, MultiScaleDeformableAttention): m.init_weights() elif isinstance(m,MultiScaleDeformableAttentionFp16): m.init_weights() if not self.as_two_stage: xavier_init(self.reference_points_embed, distribution='uniform', bias=0.) normal_(self.level_embeds) @staticmethod def get_reference_points(spatial_shapes, valid_ratios, device): """Get the reference points used in decoder. Args: spatial_shapes (Tensor): The shape of all feature maps, has shape (num_level, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) device (obj:`device`): The device where reference_points should be. Returns: Tensor: reference points used in decoder, has \ shape (bs, num_keys, num_levels, 2). 
""" reference_points_list = [] for lvl, (H, W) in enumerate(spatial_shapes): # TODO check this 0.5 ref_y, ref_x = torch.meshgrid( torch.linspace( 0.5, H - 0.5, H, dtype=torch.float32, device=device), torch.linspace( 0.5, W - 0.5, W, dtype=torch.float32, device=device)) ref_y = ref_y.reshape(-1)[None] / ( valid_ratios[:, None, lvl, 1] * H) ref_x = ref_x.reshape(-1)[None] / ( valid_ratios[:, None, lvl, 0] * W) ref = torch.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = torch.cat(reference_points_list, 1) reference_points = reference_points[:, :, None] * valid_ratios[:, None] return reference_points def get_valid_ratio(self, mask): """Get the valid radios of feature maps of all level.""" _, H, W = mask.shape valid_H = torch.sum(~mask[:, :, 0], 1) valid_W = torch.sum(~mask[:, 0, :], 1) valid_ratio_h = valid_H.float() / H valid_ratio_w = valid_W.float() / W valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) return valid_ratio def get_proposal_pos_embed(self, proposals, num_pos_feats=128, temperature=10000): """Get the position embedding of proposal.""" scale = 2 * math.pi dim_t = torch.arange( num_pos_feats, dtype=torch.float32, device=proposals.device) dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) # N, L, 4 proposals = proposals.sigmoid() * scale # N, L, 4, 128 pos = proposals[:, :, :, None] / dim_t # N, L, 4, 64, 2 pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) return pos def forward(self, mlvl_feats, mlvl_masks, query_embed, mlvl_pos_embeds, reg_branches=None, cls_branches=None, **kwargs): """Forward function for `Transformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, embed_dims, h, w]. mlvl_masks (list(Tensor)): The key_padding_mask from different level used for encoder and decoder, each element has shape [bs, h, w]. query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. mlvl_pos_embeds (list(Tensor)): The positional encoding of feats from different level, has the shape [bs, embed_dims, h, w]. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is True. Default to None. cls_branches (obj:`nn.ModuleList`): Classification heads for feature maps from each decoder layer. Only would be passed when `as_two_stage` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If return_intermediate_dec is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of \ proposals generated from \ encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. - enc_outputs_coord_unact: The regression results \ generated from encoder's feature maps., has shape \ (batch, h*w, 4). Only would \ be returned when `as_two_stage` is True, \ otherwise None. 
""" assert self.as_two_stage or query_embed is not None feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (feat, mask, pos_embed) in enumerate( zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): bs, c, h, w = feat.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) feat = feat.flatten(2).transpose(1, 2) mask = mask.flatten(1) pos_embed = pos_embed.flatten(2).transpose(1, 2) lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) lvl_pos_embed_flatten.append(lvl_pos_embed) feat_flatten.append(feat) mask_flatten.append(mask) feat_flatten = torch.cat(feat_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) spatial_shapes = torch.as_tensor( spatial_shapes, dtype=torch.long, device=feat_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros( (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) valid_ratios = torch.stack( [self.get_valid_ratio(m) for m in mlvl_masks], 1) # reference_points = \ # self.get_reference_points(spatial_shapes, # valid_ratios, # device=feat.device) feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) # lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( # 1, 0, 2) # (H*W, bs, embed_dims) # memory = self.encoder( # query=feat_flatten, # key=None, # value=None, # query_pos=lvl_pos_embed_flatten, # query_key_padding_mask=mask_flatten, # spatial_shapes=spatial_shapes, # reference_points=reference_points, # level_start_index=level_start_index, # valid_ratios=valid_ratios, # **kwargs) memory = feat_flatten.permute(1, 0, 2) bs, _, c = memory.shape query_pos, query = torch.split(query_embed, c, dim=-1) reference_points = self.reference_points_embed(query_pos).sigmoid() init_reference_out = reference_points # decoder query = query.permute(1, 0, 2) memory = memory.permute(1, 0, 2) query_pos = query_pos.permute(1, 0, 2) inter_states, inter_references = self.decoder( query=query, key=None, value=memory, query_pos=query_pos, key_padding_mask=mask_flatten, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, reg_branches=reg_branches, **kwargs) inter_references_out = inter_references return inter_states, init_reference_out, inter_references_out ================================================ FILE: plugin/models/transformer_utils/fp16_dattn.py ================================================ from turtle import forward import warnings try: from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention except ImportError: warnings.warn( '`MultiScaleDeformableAttention` in MMCV has been moved to ' '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV') from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention from mmcv.runner import force_fp32, auto_fp16 from mmcv.cnn.bricks.registry import ATTENTION from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.cnn.bricks.transformer import build_attention import math import warnings import torch import torch.nn as nn import torch.nn.functional as F from torch.autograd.function import Function, once_differentiable from mmcv import deprecated_api_warning from mmcv.cnn import constant_init, xavier_init from mmcv.cnn.bricks.registry import ATTENTION from mmcv.runner import BaseModule from mmcv.utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) from torch.cuda.amp import custom_bwd, custom_fwd 
@ATTENTION.register_module() class MultiScaleDeformableAttentionFp16(BaseModule): def __init__(self, attn_cfg=None,init_cfg=None,**kwarg): super(MultiScaleDeformableAttentionFp16,self).__init__(init_cfg) # import ipdb; ipdb.set_trace() self.deformable_attention = build_attention(attn_cfg) self.deformable_attention.init_weights() self.fp16_enabled = False @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points','identity')) def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): # import ipdb; ipdb.set_trace() return self.deformable_attention(query, key=key, value=value, identity=identity, query_pos=query_pos, key_padding_mask=key_padding_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index,**kwargs) class MultiScaleDeformableAttnFunctionFp32(Function): @staticmethod @custom_fwd(cast_inputs=torch.float32) def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. Returns: Tuple[Tensor]: Gradient of input tensors in forward. """ value, value_spatial_shapes, value_level_start_index,\ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): """CPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). 
attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), Returns: Tensor: has shape (bs, num_queries, embed_dims) """ bs, _, num_heads, embed_dims = value.shape _, num_queries, num_heads, num_levels, num_points, _ =\ sampling_locations.shape value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level, (H_, W_) in enumerate(value_spatial_shapes): # bs, H_*W_, num_heads, embed_dims -> # bs, H_*W_, num_heads*embed_dims -> # bs, num_heads*embed_dims, H_*W_ -> # bs*num_heads, embed_dims, H_, W_ value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape( bs * num_heads, embed_dims, H_, W_) # bs, num_queries, num_heads, num_points, 2 -> # bs, num_heads, num_queries, num_points, 2 -> # bs*num_heads, num_queries, num_points, 2 sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) # bs*num_heads, embed_dims, num_queries, num_points sampling_value_l_ = F.grid_sample( value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (bs, num_queries, num_heads, num_levels, num_points) -> # (bs, num_heads, num_queries, num_levels, num_points) -> # (bs, num_heads, 1, num_queries, num_levels*num_points) attention_weights = attention_weights.transpose(1, 2).reshape( bs * num_heads, 1, num_queries, num_levels * num_points) output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(bs, num_heads * embed_dims, num_queries) return output.transpose(1, 2).contiguous() @ATTENTION.register_module() class MultiScaleDeformableAttentionFP32(BaseModule): """An attention module used in Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. 
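    Example:
        Minimal shape sketch (illustrative sizes; ``batch_first`` keeps its
        default ``False``, so queries are ``(num_query, bs, embed_dims)``):

        >>> attn = MultiScaleDeformableAttentionFP32(
        ...     embed_dims=256, num_heads=8, num_levels=1, num_points=4)
        >>> # query:            (num_query, bs, 256)
        >>> # value:            (num_key, bs, 256), with num_key == H * W
        >>> # reference_points: (bs, num_query, num_levels, 2), in [0, 1]
        >>> # spatial_shapes:   tensor([[H, W]]); level_start_index: tensor([0])
        >>> out = attn(query, value=value,
        ...            reference_points=reference_points,
        ...            spatial_shapes=spatial_shapes,
        ...            level_start_index=level_start_index)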
""" def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, im2col_step=64, dropout=0.1, batch_first=False, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True @deprecated_api_warning({'residual': 'identity'}, cls_name='MultiScaleDeformableAttention') def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. 
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
             Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """

        if value is None:
            value = query
        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query, embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.view(bs, num_value, self.num_heads, -1)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)

        attention_weights = attention_weights.view(bs, num_query,
                                                   self.num_heads,
                                                   self.num_levels,
                                                   self.num_points)
        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets \
                / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                + sampling_offsets / self.num_points \
                * reference_points[:, :, None, :, None, 2:] \
                * 0.5
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
        if torch.cuda.is_available():
            output = MultiScaleDeformableAttnFunctionFp32.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            # The pure-PyTorch fallback takes neither level_start_index nor
            # im2col_step (see its signature above).
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)

        output = self.output_proj(output)

        if not self.batch_first:
            # (num_query, bs, embed_dims)
            output = output.permute(1, 0, 2)

        return self.dropout(output) + identity



================================================
FILE: plugin/models/utils/__init__.py
================================================



================================================
FILE: plugin/models/utils/query_update.py
================================================
import math
import torch
import torch.nn as nn
import numpy as np

from mmcv.cnn import bias_init_with_prob, xavier_init


class Embedder:
    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.create_embedding_fn()

    def create_embedding_fn(self):
        embed_fns = []
        d = self.kwargs['input_dims']
        out_dim = 0
        if self.kwargs['include_input']:
            embed_fns.append(lambda x: x)
            out_dim += d

        max_freq = self.kwargs['max_freq_log2']
        N_freqs = self.kwargs['num_freqs']

        if self.kwargs['log_sampling']:
            freq_bands = 2.**torch.linspace(0., max_freq, steps=N_freqs)
        else:
            freq_bands = torch.linspace(2.**0., 2.**max_freq, steps=N_freqs)

        for freq in freq_bands:
            for p_fn in self.kwargs['periodic_fns']:
                embed_fns.append(lambda x, p_fn=p_fn, freq=freq: p_fn(x * freq))
                out_dim += d

        self.embed_fns = embed_fns
        self.out_dim = out_dim

    def embed(self, inputs):
        return torch.cat([fn(inputs) for fn in self.embed_fns], -1)


class MotionMLP(nn.Module):
    '''
    Args:
        c_dim (int): dimension of latent code c
        f_dim (int): feature dimension
    '''

    def __init__(self, c_dim, f_dim=512, identity=True):
super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.identity = identity multires = 10 embed_kwargs = { 'include_input' : True, 'input_dims' : c_dim, 'max_freq_log2' : multires-1, 'num_freqs' : multires, 'log_sampling' : True, 'periodic_fns' : [torch.sin, torch.cos], } self.pos_embedder = Embedder(**embed_kwargs) self.fc = nn.Sequential( nn.Linear(f_dim + self.pos_embedder.out_dim, 2*f_dim), nn.LayerNorm(2*f_dim), nn.ReLU(), nn.Linear(2*f_dim, f_dim) ) self.init_weights() def init_weights(self): for m in self.fc: for param in m.parameters(): if param.dim() > 1: nn.init.xavier_uniform_(param) def forward(self, x, pose_info): pose_embed = self.pos_embedder.embed(pose_info) xc = torch.cat([x, pose_embed], dim=-1) out = self.fc(xc) if self.identity: out = out + x return out ================================================ FILE: plugin/models/utils/renderer_track.py ================================================ import os.path as osp import os #import av2.geometry.interpolate as interp_utils import numpy as np import copy import cv2 import matplotlib.pyplot as plt from PIL import Image def remove_nan_values(uv): is_u_valid = np.logical_not(np.isnan(uv[:, 0])) is_v_valid = np.logical_not(np.isnan(uv[:, 1])) is_uv_valid = np.logical_and(is_u_valid, is_v_valid) uv_valid = uv[is_uv_valid] return uv_valid def points_ego2img(pts_ego, extrinsics, intrinsics): pts_ego_4d = np.concatenate([pts_ego, np.ones([len(pts_ego), 1])], axis=-1) pts_cam_4d = extrinsics @ pts_ego_4d.T uv = (intrinsics @ pts_cam_4d[:3, :]).T uv = remove_nan_values(uv) depth = uv[:, 2] uv = uv[:, :2] / uv[:, 2].reshape(-1, 1) return uv, depth def draw_polyline_ego_on_img(polyline_ego, img_bgr, extrinsics, intrinsics, color_bgr, thickness): if polyline_ego.shape[1] == 2: zeros = np.zeros((polyline_ego.shape[0], 1)) polyline_ego = np.concatenate([polyline_ego, zeros], axis=1) polyline_ego = interp_utils.interp_arc(t=500, points=polyline_ego) uv, depth = points_ego2img(polyline_ego, extrinsics, intrinsics) h, w, c = img_bgr.shape is_valid_x = np.logical_and(0 <= uv[:, 0], uv[:, 0] < w - 1) is_valid_y = np.logical_and(0 <= uv[:, 1], uv[:, 1] < h - 1) is_valid_z = depth > 0 is_valid_points = np.logical_and.reduce([is_valid_x, is_valid_y, is_valid_z]) if is_valid_points.sum() == 0: return uv = np.round(uv[is_valid_points]).astype(np.int32) draw_visible_polyline_cv2( copy.deepcopy(uv), valid_pts_bool=np.ones((len(uv), 1), dtype=bool), image=img_bgr, color=color_bgr, thickness_px=thickness, ) def draw_visible_polyline_cv2(line, valid_pts_bool, image, color, thickness_px): """Draw a polyline onto an image using given line segments. Args: line: Array of shape (K, 2) representing the coordinates of line. valid_pts_bool: Array of shape (K,) representing which polyline coordinates are valid for rendering. For example, if the coordinate is occluded, a user might specify that it is invalid. Line segments touching an invalid vertex will not be rendered. image: Array of shape (H, W, 3), representing a 3-channel BGR image color: Tuple of shape (3,) with a BGR format color thickness_px: thickness (in pixels) to use when rendering the polyline. 
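    Example:
        Minimal sketch with made-up values, drawing a three-point red
        (BGR ``(0, 0, 255)``) polyline onto a blank image:

        >>> image = np.zeros((100, 200, 3), dtype=np.uint8)
        >>> line = np.array([[10, 10], [50, 40], [120, 80]])
        >>> valid = np.ones(len(line), dtype=bool)
        >>> draw_visible_polyline_cv2(line, valid, image,
        ...                           color=(0, 0, 255), thickness_px=2)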
""" line = np.round(line).astype(int) # type: ignore for i in range(len(line) - 1): if (not valid_pts_bool[i]) or (not valid_pts_bool[i + 1]): continue x1 = line[i][0] y1 = line[i][1] x2 = line[i + 1][0] y2 = line[i + 1][1] # Use anti-aliasing (AA) for curves image = cv2.line(image, pt1=(x1, y1), pt2=(x2, y2), color=color, thickness=thickness_px, lineType=cv2.LINE_AA) COLOR_MAPS_BGR = { # bgr colors 'divider': (0, 0, 255), 'boundary': (0, 255, 0), 'ped_crossing': (255, 0, 0), 'centerline': (51, 183, 255), 'drivable_area': (171, 255, 255) } COLOR_MAPS_PLT = { 'divider': 'r', 'boundary': 'g', 'ped_crossing': 'b', 'centerline': 'orange', 'drivable_area': 'y', } CAM_NAMES_AV2 = ['ring_front_center', 'ring_front_right', 'ring_front_left', 'ring_rear_right','ring_rear_left', 'ring_side_right', 'ring_side_left', ] CAM_NAMES_NUSC = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT',] class Renderer(object): """Render map elements on image views. Args: cat2id (dict): category to class id roi_size (tuple): bev range dataset (str): 'av2' or 'nusc' """ def __init__(self, cat2id, roi_size, dataset='av2'): self.roi_size = roi_size self.cat2id = cat2id self.id2cat = {v: k for k, v in cat2id.items()} if dataset == 'av2': self.cam_names = CAM_NAMES_AV2 else: self.cam_names = CAM_NAMES_NUSC def render_bev_from_vectors(self, vectors, labels, out_path, id_info=None, score_info=None): '''Render bev segmentation using vectorized map elements. Args: vectors (list): list of vectorized map elements. labels (list): list of labels of map elements. out_dir (str): output directory ''' if id_info is not None: assert len(vectors) == len(id_info) if score_info is not None: assert len(vectors) == len(score_info) car_img = Image.open('resources/car.png') plt.figure(figsize=(self.roi_size[0], self.roi_size[1])) plt.xlim(-self.roi_size[0] / 2, self.roi_size[0] / 2) plt.ylim(-self.roi_size[1] / 2, self.roi_size[1] / 2) plt.axis('off') plt.imshow(car_img, extent=[-2.5, 2.5, -2.0, 2.0]) for idx in range(len(labels)): cat = self.id2cat[labels[idx].item()] color = COLOR_MAPS_PLT[cat] vector = vectors[idx].detach().cpu().numpy() pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) # plt.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1], angles='xy', color=color, # scale_units='xy', scale=1) # for i in range(len(x)): plt.plot(x, y, 'o-', color=color, linewidth=20, markersize=50) if id_info is not None: vec_id = int(id_info[idx]) mid_idx = len(x) // 2 if vec_id == -1: plt.text(x[mid_idx], y[mid_idx], 'FP', fontsize=100, color=color) else: plt.text(x[mid_idx], y[mid_idx], '{}'.format(vec_id), fontsize=100, color=color) if score_info is not None: mid_idx = len(x) // 2 plt.text(x[mid_idx]-1, y[mid_idx]+2, '{:.2f}'.format(score_info[idx]), fontsize=100, color='purple') plt.savefig(out_path, bbox_inches='tight', dpi=40) plt.close() def render_bev_from_mask(self, semantic_mask, out_path): '''Render bev segmentation from semantic_mask. Args: semantic_mask (array): semantic mask. 
out_dir (str): output directory ''' c, h, w = semantic_mask.shape bev_img = np.ones((3, h, w), dtype=np.uint8) * 255 if 'drivable_area' in self.cat2id: drivable_area_mask = semantic_mask[self.cat2id['drivable_area']] bev_img[:, drivable_area_mask == 1] = \ np.array(COLOR_MAPS_BGR['drivable_area']).reshape(3, 1) # NOTE: the semantic mask has been changed into instance masks for our use for label in range(c): cat = self.id2cat[label] if cat == 'drivable_area': continue valid = semantic_mask[label] == 1 bev_img[:, valid] = np.array(COLOR_MAPS_BGR[cat]).reshape(3, 1) #for label in range(c): # cat = self.id2cat[label] # if cat == 'drivable_area': # continue # mask = semantic_mask[label] # valid = mask == 1 # bev_img[:, valid] = np.array(COLOR_MAPS_BGR[cat]).reshape(3, 1) cv2.imwrite(out_path, bev_img.transpose((1, 2, 0))) ================================================ FILE: requirements.txt ================================================ av2 nuscenes-devkit einops==0.6.1 numpy==1.23.5 numba==0.53.0 Shapely==1.8.5 yapf==0.40.1 setuptools==59.5.0 imageio-ffmpeg==0.4.9 ================================================ FILE: tools/benchmark.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import time import torch from mmcv import Config from mmcv.parallel import MMDataParallel from mmcv.runner import load_checkpoint, wrap_fp16_model import sys from mmdet3d.datasets import build_dataset from mmdet3d.models import build_detector from tools.misc.fuse_conv_bn import fuse_module def parse_args(): parser = argparse.ArgumentParser(description='MMDet benchmark a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', default=None, help='checkpoint file') parser.add_argument('--samples', default=2000, help='samples to benchmark') parser.add_argument( '--log-interval', default=50, help='interval of logging') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') args = parser.parse_args() return args def main(): args = parse_args() cfg = Config.fromfile(args.config) import sys, os sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib if hasattr(cfg, 'plugin_dir'): def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs,list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) else: # import dir is the dirpath for the config file _module_dir = os.path.dirname(args.config) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' 
+ m print(_module_path) plg_lib = importlib.import_module(_module_path) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) from plugin.datasets.builder import build_dataloader data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False, shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), ) # build the model and load checkpoint cfg.model.train_cfg = None model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) if args.checkpoint is not None: load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_module(model) model = MMDataParallel(model, device_ids=[0]) model.eval() # the first several iterations may be very slow so skip them num_warmup = 5 pure_inf_time = 0 # benchmark with several samples and take the average for i, data in enumerate(data_loader): torch.cuda.synchronize() start_time = time.perf_counter() with torch.no_grad(): model(return_loss=False, rescale=True, **data) torch.cuda.synchronize() elapsed = time.perf_counter() - start_time if i >= num_warmup: pure_inf_time += elapsed if (i + 1) % args.log_interval == 0: fps = (i + 1 - num_warmup) / pure_inf_time print(f'Done image [{i + 1:<3}/ {args.samples}], ' f'fps: {fps:.1f} img / s') if (i + 1) == args.samples: pure_inf_time += elapsed fps = (i + 1 - num_warmup) / pure_inf_time print(f'Overall fps: {fps:.1f} img / s') break if __name__ == '__main__': main() ================================================ FILE: tools/data_converter/__init__.py ================================================ ================================================ FILE: tools/data_converter/argoverse_converter.py ================================================ from functools import partial from multiprocessing import Pool import multiprocessing from random import sample import time import mmcv import logging from pathlib import Path from os import path as osp import os from av2.datasets.sensor.av2_sensor_dataloader import AV2SensorDataLoader import argparse CAM_NAMES = ['ring_front_center', 'ring_front_right', 'ring_front_left', 'ring_rear_right','ring_rear_left', 'ring_side_right', 'ring_side_left', # 'stereo_front_left', 'stereo_front_right', ] FAIL_LOGS = [ '01bb304d-7bd8-35f8-bbef-7086b688e35e', '453e5558-6363-38e3-bf9b-42b5ba0a6f1d', '75e8adad-50a6-3245-8726-5e612db3d165', '54bc6dbc-ebfb-3fba-b5b3-57f88b4b79ca', 'af170aac-8465-3d7b-82c5-64147e94af7d', '6e106cf8-f6dd-38f6-89c8-9be7a71e7275', ] def parse_args(): parser = argparse.ArgumentParser(description='Data converter arg parser') parser.add_argument( '--data-root', type=str, help='specify the root path of dataset') parser.add_argument( '--newsplit', action='store_true') parser.add_argument( '--nproc', type=int, default=64, required=False, help='workers to process data') args = parser.parse_args() return args def create_av2_infos_mp(root_path, info_prefix, log_ids, split, dest_path=None, num_multithread=64, newsplit=False): """Create info file of av2 dataset. Given the raw data, generate its related info file in pkl format. 
Args: root_path (str): Path of the data root. info_prefix (str): Prefix of the info file to be generated. dest_path (str): Path to store generated file, default to root_path split (str): Split of the data. Default: 'train' """ if dest_path is None: dest_path = root_path for i in FAIL_LOGS: if i in log_ids: log_ids.remove(i) # dataloader by original split train_loader = AV2SensorDataLoader(Path(osp.join(root_path, 'train')), Path(osp.join(root_path, 'train'))) val_loader = AV2SensorDataLoader(Path(osp.join(root_path, 'val')), Path(osp.join(root_path, 'val'))) test_loader = AV2SensorDataLoader(Path(osp.join(root_path, 'test')), Path(osp.join(root_path, 'test'))) loaders = [train_loader, val_loader, test_loader] print('collecting samples...') start_time = time.time() print('num cpu:', multiprocessing.cpu_count()) print(f'using {num_multithread} threads') # ignore warning from av2.utils.synchronization_database sdb_logger = logging.getLogger('av2.utils.synchronization_database') prev_level = sdb_logger.level sdb_logger.setLevel(logging.CRITICAL) pool = Pool(num_multithread) fn = partial(get_data_from_logid, loaders=loaders, data_root=root_path) rt = pool.map_async(fn, log_ids) pool.close() pool.join() results = rt.get() samples = [] discarded = 0 sample_idx = 0 for _samples, _discarded in results: for i in range(len(_samples)): _samples[i]['sample_idx'] = sample_idx sample_idx += 1 samples.extend(_samples) discarded += _discarded sdb_logger.setLevel(prev_level) print(f'{len(samples)} available samples, {discarded} samples discarded') id2map = {} for log_id in log_ids: for i in range(3): if log_id in loaders[i]._sdb.get_valid_logs(): loader = loaders[i] map_path_dir = osp.join(loader._data_dir, log_id, 'map') map_fname = os.path.basename(str(list(Path(map_path_dir).glob("log_map_archive_*.json"))[0])) map_fname = osp.join(map_path_dir, map_fname) id2map[log_id] = map_fname print('collected in {:.1f}s'.format(time.time() - start_time)) infos = dict(samples=samples, id2map=id2map) if newsplit: info_path = osp.join(dest_path, '{}_map_infos_{}_newsplit.pkl'.format(info_prefix, split)) else: info_path = osp.join(dest_path, '{}_map_infos_{}.pkl'.format(info_prefix, split)) print(f'saving results to {info_path}') mmcv.dump(infos, info_path) def get_data_from_logid(log_id, loaders, data_root): samples = [] discarded = 0 # find corresponding loader for i in range(3): if log_id in loaders[i]._sdb.get_valid_logs(): loader = loaders[i] # use lidar timestamps to query all sensors. 
# the frequency is 10Hz cam_timestamps = loader._sdb.per_log_lidar_timestamps_index[log_id] prev = -1 for ts in cam_timestamps: cam_ring_fpath = [loader.get_closest_img_fpath( log_id, cam_name, ts ) for cam_name in CAM_NAMES] lidar_fpath = loader.get_closest_lidar_fpath(log_id, ts) # if bad sensor synchronization, discard the sample if None in cam_ring_fpath or lidar_fpath is None: discarded += 1 continue cams = {} for i, cam_name in enumerate(CAM_NAMES): pinhole_cam = loader.get_log_pinhole_camera(log_id, cam_name) cams[cam_name] = dict( img_fpath=str(cam_ring_fpath[i]), intrinsics=pinhole_cam.intrinsics.K, extrinsics=pinhole_cam.extrinsics, ) city_SE3_ego = loader.get_city_SE3_ego(log_id, int(ts)) e2g_translation = city_SE3_ego.translation e2g_rotation = city_SE3_ego.rotation samples.append(dict( e2g_translation=e2g_translation, e2g_rotation=e2g_rotation, cams=cams, lidar_fpath=str(lidar_fpath), prev=prev, # map_fpath=map_fname, token=str(ts), log_id=log_id, scene_name=log_id)) prev = str(ts) return samples, discarded if __name__ == '__main__': args = parse_args() with open('tools/data_converter/av2_train_split.txt') as f: train_split = [s.strip() for s in f.readlines()] with open('tools/data_converter/av2_val_split.txt') as f: val_split = [s.strip() for s in f.readlines()] test_split = None if not args.newsplit: train_split = os.listdir(osp.join(args.data_root, 'train')) val_split = os.listdir(osp.join(args.data_root, 'val')) test_split = os.listdir(osp.join(args.data_root, 'test')) create_av2_infos_mp( root_path=args.data_root, split='train', log_ids=train_split, info_prefix='av2', dest_path=args.data_root, newsplit=args.newsplit) create_av2_infos_mp( root_path=args.data_root, split='val', log_ids=val_split, info_prefix='av2', dest_path=args.data_root, newsplit=args.newsplit) if test_split: create_av2_infos_mp( root_path=args.data_root, split='test', log_ids=test_split, info_prefix='av2', dest_path=args.data_root,) ================================================ FILE: tools/data_converter/av2_train_split.txt ================================================ bb110668-5037-3c04-bd34-34cf1ace8d0f 8beeb8db-28f9-396c-b752-17f906505948 247f91e7-3177-33ad-b99e-0e0a4dc76751 40bfcbec-ec59-3731-8e75-67f0bddc3b01 ef4a46c4-138e-3478-b94e-3e60a567ec7d cf6a99cb-b8bc-34d7-bdca-30e50e66cd74 575d8d24-ba88-3b18-84c0-df5b29dccfde e66d1403-755b-3f63-938b-a2a69446a48a ed93e1bb-7bbc-3444-8fc8-08a271438fc8 0d37aee4-6508-33a2-998d-724834e80030 7ce2a2ce-eed6-36d7-ba62-dda9acddb070 06852209-b868-306b-b492-ee6dbc914cf8 e424d4f7-4b28-322f-b630-31d42ae528eb 768cf7e2-eb6c-3468-969e-e3b0fd87b34e b2a8a9aa-19cd-3ffd-b02c-0f2a47d1d0eb 928e282f-d1a0-3e85-9582-0b33664c49e8 5d8f4b0a-27f8-3889-925f-e9a146a395eb 58a6bfcf-071e-3a6d-90f4-0e4cbdc298eb 90f360d8-35f7-3c54-b2da-e99b354bc4cb e5178032-d260-3bc8-968e-a5cb98b6ae5a af170aac-8465-3d7b-82c5-64147e94af7d a0cb0614-ee71-3cf3-b891-a4274883362f de56b100-508b-3479-81fe-735349f8e8de b87683ae-14c5-321f-8af3-623e7bafc3a7 92b900b1-ac4a-3d41-b118-e42c66382c91 2e95b33b-8ea1-3b48-875b-2f35f3092059 75e8adad-50a6-3245-8726-5e612db3d165 ccb4e29d-e88f-3fbe-8958-67cfd62350a3 a3f59292-ad1d-370a-afde-64a9e16b341c f648b945-6c70-3105-bd23-9502894e37d4 df5d0b0e-5bcb-304a-a167-18b92d0f1d45 6aae7f38-21de-31bf-8761-29d458338958 80f31501-6533-3257-9870-b0c4dbf61967 57356998-297c-330a-af4e-c6a1ad64f923 0b9321c1-9bc2-4727-beb8-8046aa4bb6c4 f7cf93d8-f7bd-3799-8500-fbe842a96f63 108d2060-8bef-3d1c-88c5-c8295f596595 4667e48c-4d16-38be-b277-6b0013d6588c 3576c0f5-c1b5-35bb-a0c4-ee95cba5c754 
a059b6b9-ca26-4881-bcf7-d202433de0c2 53a8391f-b2d7-341f-95ce-b9174d48e040 fd4e2c4c-f7e9-3110-8e32-28d3add3937d 1ab241cf-f9c5-3f8a-88bd-4e14baad8ede 2aea7bd1-432a-43c5-9445-651102487f65 d9530d0a-b83e-44a3-910a-2b5bb8f1fb80 072c8e90-a51c-3429-9cdf-4dababb4e9d8 97738d01-b24f-365e-8818-2463149154d2 b8a5a7a9-1c4f-4f2e-96a6-565e727b24d0 c71cd96c-8e3f-3861-9ece-fcbabebc63a8 c08279c0-10b4-3d21-b13f-a1c1a0b87f8b f3f8f680-e471-3662-a06a-0c00e6d88f43 d78b78a0-2322-32c2-833a-e42ddc132d30 271f4204-dd77-350b-b9db-5dabc4191985 4766da89-ca13-3e92-b53e-00bc710e9bba b8d83d8e-1574-3efd-b067-a3ed422a259e 2a9b2658-097e-3f8b-a817-22f2553c5de8 c6c55112-0078-3867-a63f-1861a0125b8d 286790ed-9dec-305b-bcad-4f8153301e7c c2c0e6bc-05e5-30dd-8e5e-0e7b6106ad30 4f363f6a-e51c-4d22-b232-cf78f1520966 8e5022ed-87a9-4480-b2a7-a7c0494f5c7b 95acebfe-c694-3dab-9e6d-01cb501ff426 14c8d182-9586-3f21-ad20-c4e19ec03e2c 5ccb359a-2986-466c-88b2-a16f51774a8f 4eb237d9-9f8c-3426-9da6-4aad349ff8aa a783b484-437b-3569-bd44-4f83ad9e05cf d3efe9ba-f10a-35e7-b17e-6850c66693fe 8a11791c-1d8f-3b12-bacc-38aa982b0003 8c019de9-7043-37bc-9498-b5858e7240af 63f32613-2856-4ab0-898d-f881d74eb8bf 56c24ed8-68c5-3a08-8e3c-19646ac670e5 8bc34c99-1b8f-3463-b0e7-12bf1eb222b3 71d1938d-536e-39eb-beb4-bb4f5b607427 7b0bf9d6-084a-31d4-9e52-d9b582a0ec84 b09ba294-96b3-3c45-aeed-c40a309e5f4e 6dc6e668-549e-33b8-b952-ed8e807b1d48 595acd37-183c-489f-bb8a-c299a86b74c0 a9a24c80-600f-3f85-b4d9-a70ceccad385 718a2f8d-954a-3cd8-89e6-43898cf21fee c8ec7be0-92aa-3222-946e-fbcf398c841e 7039e410-b5ab-35aa-96bc-2c4b89d3c5e3 72cf3ca1-1a9e-3254-bca0-29c62521e454 65387aee-4490-38b9-8f4f-1fc43bd4ac06 27c03d98-6ac3-38a3-ba5e-102b184d01ef 58fed0d4-97d5-469b-89a4-4394838e10c7 d1395998-7e8a-417d-91e9-5ca6ec045ee1 e72ef05c-8b94-3885-a34f-fff3b2b954b4 36aec72e-5086-376c-b109-295b128e77e1 ecbe6def-7560-352c-8822-b2b92613e1e4 544a8102-0ef5-3044-921e-dc0544370376 cf5aaa11-4f92-3377-a7a2-861f305023eb 20dd185d-b4eb-3024-a17a-b4e5d8b15b65 ded5ef6e-46ea-3a66-9180-18a6fa0a2db4 42f92807-0c5e-3397-bd45-9d5303b4db2a f668074d-d6c6-3ea7-a7b5-aad0a1203b03 d3ca0450-2167-38fb-b34b-449741cb38f3 1ad57a00-cc61-3f5f-9e2a-9981a57e9856 418da7f6-88e4-388b-a69f-44cabd24ed55 613558a1-6a8e-3fda-8fa6-1045a064a0f9 6626b7b2-bcc8-4497-ae92-307ceacd5010 9a82e3c8-1738-3f85-9245-1d3717171d2f a69fa035-5121-3a39-a3ce-e33e9f54b506 14896a70-a440-34d0-b68e-fd9882557da6 c42d34f3-78d5-35be-9c47-77d297caebfe c9fc62c5-a289-36e3-a900-7e7807eb2716 e368d49a-e02a-3374-876e-8325f66c3574 6b012ed3-c322-3522-b52c-b4f24f894d4c 30e94a6b-ca9e-3d2c-9099-86700ce845f9 5f016e44-0f38-3837-9111-58ec18d1a5e6 4bf8e9ff-e1a1-3a22-a9d1-80f3846c0263 841fe537-5e76-3b3a-8298-75fa1a41a14b 64037371-4aa4-3fed-97f4-bafc1674caee b5ea60b0-2540-4efe-b60e-f421ade3c128 34fe8fbd-2b1a-3552-94c4-e80d1e85e5c2 67d5fb0a-baf8-32f4-9316-18ce755f3e8b 4619e709-c9c0-3b26-923f-23a78e231136 285dcef7-9f00-3c9c-baca-6c8269210ac4 67d00dd9-fd33-3518-85f5-e26353373c33 156a412d-3699-3c1c-9ada-6ab587347996 9d65b03d-b59d-3a45-ba66-e313d3cdab40 022af476-9937-3e70-be52-f65420d52703 539b7a79-54c6-30ef-8e06-210d42c79125 b4fc7eef-819c-35a6-b937-358ffb5c2aa0 ec02cf7e-36d3-3e9d-8835-3b6c27975bea 6ee14358-31cb-3c6d-82f5-54d6a20444de a7bcdabb-f9b7-3c16-806d-3ddf1c2d49a2 54bc6dbc-ebfb-3fba-b5b3-57f88b4b79ca 45433055-2b69-3cff-8135-67b3bfa04034 f61bcee1-2964-3c4b-95a5-697df5f42f47 902d5e72-b665-3615-af2d-a2b6164864b2 812a45e8-b5d9-389b-9151-09c57ce969df 5d391e54-adec-3584-adf0-5025d7564e1b 9f6d282e-f573-31d5-80e6-9a193e80cd7d f6350a4f-eee8-31bd-8520-28f9c81c98a8 ac3e33eb-5a53-369d-9e5a-7950392bfe99 
3844f8a9-b7d4-4919-8e9b-a0370ad29ec0 57636c80-9335-3aae-af70-11755db93854 38f30522-2d43-3ff3-a94b-84887ab1671d 968e77fb-9ab1-3427-8984-9e3028b186e2 7186d7d2-453e-4193-b327-72b66bbe3fd0 3a1b3424-700b-3b65-9e89-90772a8e24ea 7e3d8631-3b7d-38c1-b833-ee7cfa7235ca 93b755f1-f865-44dc-a98d-cae5eb1a25d0 3ca11a5e-50b2-3cc3-af7a-ce7ab02b9954 c6940de1-dccf-3b42-9c39-bbb9feb2d638 caabc342-aed4-3104-8195-7461a4add481 afbdd2e0-696a-3222-a20a-2023baf8e5af 70e92418-e4e6-32a2-98c8-9844b1c24f92 4bab74cd-aba9-4752-9e1f-006cc639d63e 982bcae9-1840-37f4-9278-3dbb63031aac 23808d42-e4df-3a0d-b713-fe20e09a4f39 e7e7ebad-79d2-3ae1-95ce-f3035bc8f719 eec8ae97-8de4-354a-b11a-d3a14b276479 d9c267be-f19a-3183-afe0-f0625a375743 e1e9d341-716f-3613-9ec2-2201c72361af 4aae26d1-aa71-30ae-b838-2a25d1f317f5 1c8f1189-c4fe-3303-bf2a-f88f5751b81e 4c18ef76-84ba-4a78-8275-7663101fffcf 26daba71-ca0a-37e9-9dc6-0f81f02c0afd 737314f0-997a-3cd1-a652-78453bfe2b57 fdb0578d-4fa7-37a7-b60d-5472b4d39136 f4c6ade0-7b9e-4ad7-8d86-13d2f4c91499 8aaa2fac-59f9-3a3d-98ee-f3dde8b4d781 6937b4e5-b5b4-3970-b5dd-9ad194e6c338 8911207d-fc3d-3009-bb35-18138197724f 7ebbdbeb-a8de-3612-8f22-6ce91980841d b4d5e738-b937-33fd-8131-bf1df36f598a 1c7d3b85-6cec-373c-a4eb-5137d7cc6a7c e38c1e1d-f0e9-3d73-8193-29cbea481b4c 98fd128c-4f32-40fc-a23c-7feb50c4478a eed8593d-60e3-3e41-9fea-55f544b01749 52c9e613-61b3-3d17-9f6d-b28de8a14829 50d508e2-6753-4519-a8c3-ad94a76ee948 d2901fe5-4b64-3144-98e1-67ef5ef83fa7 db447b86-8103-3ee4-93de-2c838ba061dc aed1b616-9d9a-36d3-a047-07ad3955fbb7 cd822baf-4aa1-33fa-bfe5-d91386598edb 0c143226-9c39-387c-a935-1391bed6dc75 d72c43a5-95bf-3a3e-9019-cf25cc0a61c0 f46707f9-435f-3a06-9017-deae11feab53 956dd277-e000-4c6c-af4a-aee4d86971c9 7f40c022-9f9d-3805-abf8-7533175b3f25 08734a1b-0289-3aa3-a6ba-8c7121521e26 41b6f7d7-e431-3992-b783-74b9edf42215 59a668bc-5caf-3ccc-8335-e9cff4c61d0e 9cdbe6f4-938f-4ac5-88f7-94a82bea715b 382dfbe0-836e-385c-86f2-f1afcf57a402 e4f6dbab-f2eb-3bd2-9dbc-88640e3b4a5f a3e09a66-a921-3c4a-89e6-7fecf6854a3a bba89165-0e5d-3052-abb6-6a61e37861a3 c0d36fde-5672-430a-9dd1-3e2a5d4f6cab 1e6f21fd-5c8a-3526-ac50-72adae89d6a8 aa630060-2eda-37bd-ae88-a513fd9fc8e3 a8a3297d-62f3-31ac-8db7-95ef53ce0d31 8346e544-4a73-3c88-9339-c7a21fbd3a2f 93c97162-a834-3331-b15c-e4ab278b1c6d 5037a27d-95f5-352f-9c64-5b8e75f574b2 74dd5c1d-7a9a-32d1-903a-fc57e07109b9 ebece6dc-ec92-326f-97ed-f66b2970e358 bec0f69b-832c-3898-b589-0127ddc282f3 27fba275-0b37-3033-b20d-8f9848f78b1c 7c539ecc-658a-3956-a9a4-6e7f5bd67373 6fdd8d39-7b04-365a-9941-e9e805b05ecf 43efcbe2-fe24-35b0-9e69-b07c1b0725d3 5f2b8881-3447-3905-99f8-def9d72aae42 b5f3900c-b421-3032-aef2-2e91a69d1163 4977e8a8-4e1f-3ca2-a44e-454cd3756a5f fb2cce69-655d-3203-990a-74301895408d 74f15437-b85e-314a-9d86-7294b98bf07f 75a9cbdf-0bec-39f9-b536-5b37aacadf96 b6c04ab6-1c07-3e17-97d5-e870db090e52 4e302e79-1cb9-358f-a3fb-e133a655af4f e7e178aa-931a-4674-9bff-9278a54e6aae 121007f3-a0cc-3795-9606-85108b800772 7ef4a6cc-7266-3a31-8dd4-01c3d3c58dcb 4d9e3bdf-7216-3161-8281-72863f3c2bf6 b7843066-abbd-3275-ac52-90a8363f65f7 633addf9-441c-35e7-868a-738aa612d51c 105f47eb-491c-3cab-91af-83c5bc1f6c48 66a40dcc-4de0-3f72-bff0-ca543ac5019d 098fe60e-bab0-32e2-89bc-bedced881911 d6ba4898-1369-3521-981c-b9ac57420418 ce5033ee-e74e-354a-9299-8aaefbd03f59 e65e405c-8aea-30f5-a926-1e0fbbeefb9f db17141d-4d35-381d-9949-36ce767d6641 b66a9b8e-8fa8-3409-907f-a70ebd7051e1 e88132d0-4512-3d6f-a1c8-f60972332af5 df321672-461c-361b-aac9-e81cc9a88b9f bb25d7d0-9146-46a0-8ff1-ebfc25d63417 9bdb4139-173f-33d3-8730-e29752d737d3 f4cb6ba4-cd0f-30cc-9cc9-52bd14bfb3cc 
e28c16d0-084a-3dc4-aad4-9d157ca528de ede387f4-f390-3f0e-a071-eb543b73ed73 74ec2f62-9d57-39a9-bf88-97006f64ee5c a160c635-aa67-352c-a5e6-03b113493090 6180bbb1-95ce-381b-ba17-5411c5712824 d58d55ea-f30c-3622-8303-1574616b9865 6dadba1d-0f67-345b-bc5e-407ab8f7654c 3b2994cb-5f82-4835-9212-0cac8fb3d164 a88da814-ecc6-39c0-93ba-8a81f403a7e4 094c4119-eb33-3dfb-a18d-492cbdc8413a bbdb1e21-62eb-3230-8cef-a3b091c5edad aa82b61f-7156-3c68-95a4-b79cebd120eb b0663029-8f8d-398a-8a28-81ba29224696 8aeeeeca-6a79-34ef-b667-835d53536a8f f8412dbd-48b9-39f3-b534-08950f6e633b c062ba0f-7591-3225-a57d-8181622dc2da 75449af9-61a5-3a4f-95ec-3a3dc35b4cbb b7fbc13b-47ff-3e3f-a363-86d60ba664b8 79cb0109-4c92-3ede-8849-76cc6824b95a c556f8e0-a001-3586-b2cf-d3256685c39f ee27a871-85cf-494c-8519-f54815040af5 555a7659-ffce-39df-ba06-d9fcb2f812f0 1b8fc962-7036-4d7f-885e-40b631cbdeaf 7f7e4709-7596-35f9-89ac-d808178b1533 c2ec8955-1797-338f-9486-d7c41926f791 72ad5f22-3a9c-3758-81af-abda8181a622 2501c6d0-071c-3a7a-b51f-c8cbd37abe25 823371b1-3197-35d6-a6b7-bfd432e10440 63a006b5-07c8-375d-98e3-21466f5b9c6a 1a4e2d86-23d4-3a0d-a9ac-8b0936ae94ce 828ddef2-7609-3683-8e32-c21e7c07d6a6 df738339-958b-31fb-8e48-a4380f4c538a 3fbdfb6c-927f-4aaa-81b1-21b02efd4c01 c67a748c-1e93-3a6d-be38-daedf175f911 f9f6a7e9-4f79-3fdf-b1a7-ba300622f116 e1450d07-faed-3d97-b674-c6f8d2498d80 a7a2236e-8f8e-34aa-9343-722f9b3bb829 64b24fd1-f639-4f7e-a535-dbfe9fd737a1 44200521-4cad-3a5d-8568-e0f3f1ca24d4 444cce44-cc82-4620-b630-1b5849284ac7 e033cc8e-b23d-3fc6-8954-d90c5e98550e fa9ec72a-cbcf-35dc-be20-4d0d9e7215ef 3c51357e-f6e9-3cda-9036-fe6e6cd442fe 1e51a567-b416-3c46-9424-05688ff851f7 2b443c95-d55f-3cc4-a2a1-ae4af293d8d9 3d7743c1-c0a5-3ab2-976e-84af93270f30 441871a2-a9c5-3048-b7e9-d88af5acb8f1 f2b0585b-ada3-3123-963e-14df7d96ca9e c7f5e5c1-dc52-3619-8998-420b2e280d8a 855908a6-a848-3b7b-a4a3-bbab78a423cd 07e4fccb-eb2d-31e5-bbcb-6550d0860f64 f7cdc2d1-f59a-30a2-aae8-8bb81c769e6e 8223c3d0-3b08-3889-9cdc-a88592c4bd4a de586ff4-3413-367d-befc-ad022b73592b ac1b1697-42b9-4225-a666-d17f72204fa8 73539e96-eef2-3302-bdf4-a39e9d95b6e7 e0ba7664-d287-39df-8193-00d60cae1417 0a132537-3aec-35bb-af13-7faa0811000d b29b43d7-3af9-363e-aaeb-8805d958f982 8ca98d88-67b5-385e-80f7-b32758668fab a4f240a0-12d4-3542-a11f-0c592e90e4da 1844c439-b94c-332a-bb94-600818350eb4 ce0e814a-d9df-3975-a521-d8ae9a091e96 95a47a36-1041-3924-bbd0-4dcad52c323a f54c1d50-48a3-4651-bfb0-50b87f13dc9e 890cf3b7-3385-390c-8b2e-132c744b5d2d 189c8512-b034-3d58-a372-cf48eacf02dd 4e391f98-31a6-330d-9252-d02aab82f5db 1d950a38-5c2f-39ce-9cd3-61249bc85194 4e1ac476-80a2-3612-bfd7-1abd24d2b644 abd4fe8d-7520-3b35-b8ac-4de367141b6f 0b97f5dd-c396-3c02-b07d-b7fdbcb6c3d0 6aa2ac89-6b25-3af6-ad59-221351189f4b 298715e3-b204-3bf5-b8c2-fe3be9e310e8 e1f37027-6a39-3eb1-b38a-3f2836b84735 9e684390-4af3-3ec5-b163-855bbd026ff1 1842383a-1577-3b7a-90db-41a9a6668ee2 7a1412d3-5a53-378f-85df-ba58b2408f46 91cded81-9f72-3930-bab7-5d3e3fa0a220 a7c9bb12-322e-3f8e-8798-cf57a4a72f99 648e8393-f46f-384b-9bd1-c25a2285077d c69e348a-8e10-31dc-b71b-dd8e5cfd7211 87ca3d9f-f317-3efb-b1cb-aaaf525227e5 182ba3f7-b89a-36cc-ae40-32a341b0d3e9 f6cc0ebf-fc6a-3bf2-8bcb-76d8c43f194e f2576c8a-da9b-450e-88cf-a70af1b0eadf 78683234-e6f1-3e4e-af52-6f839254e4c0 7dbc2eac-5871-3480-b322-246e03d954d2 20bcd747-ef60-391a-9f4a-ae99f049c260 11ba4e81-c26f-3cd1-827d-b6913bcef64e eec284b2-840a-3c75-aa42-04d2e309bbe1 b50c4763-5d1e-37f4-a009-2244aeebabcd 15ec0778-826e-3ed7-9775-54fbf66997f4 e0ea281b-6956-3605-b720-71b54ec87d25 e8c9fd64-fdd2-422d-a2a2-6f47500d1d12 b8489c02-60d0-3f44-a3b4-9de62830d666 
0b86f508-5df9-4a46-bc59-5b9536dbde9f 201fe83b-7dd7-38f4-9d26-7b4a668638a9 335aabef-269e-3211-a99d-2c3a3a8f8475 76916359-96f4-3274-81fe-bb145d497c11 22052525-4f85-3fe8-9d7d-000a9fffce36 4e3fedbb-847c-3d5b-8a62-c9ff84550985 77574006-881f-3bc8-bbb6-81d79cf02d83 dafe14f5-825c-4e7a-9009-6dfdfdd5b030 2f2321d2-7912-3567-a789-25e46a145bda bbd19ca1-805a-3c22-8df3-cd7501aa06f3 58e82365-03bc-3b2f-b55a-a4ad0e3e792d d770f926-bca8-31de-9790-73fbb7b6a890 b6500255-eba3-3f77-acfd-626c07aa8621 8749f79f-a30b-3c3f-8a44-dbfa682bbef1 47286726-5dd4-4e26-bd2d-5324f429e445 185d3943-dd15-397a-8b2e-69cd86628fb7 2ff4f798-78d9-3384-87e9-61928aa4cb6d 6803104a-bb06-402e-8471-e5af492db0a8 dc9077b9-2fe0-3d18-9b97-8067ff090874 7a2c222d-addc-30b2-aac6-596cb65a22e3 0fb7276f-ecb5-3e5b-87a8-cc74c709c715 3b3570b4-7b0b-3268-a571-b0889dbf40b6 e42aa296-0e5d-4733-87ec-131a82f917bc 19350c96-623d-4d77-af96-f8c23f00c358 02a00399-3857-444e-8db3-a8f58489c394 7e4d67b3-c3cc-3288-afe5-043602ea3c70 5c0584a3-52a6-3029-b6ff-ca45a19d8aa6 a1589ae2-2678-310e-91cc-c4b512cd7fa5 3de5b5d6-68c4-3c95-84ed-be7c83d829f8 9d16e76e-46ae-38c6-8399-99218514afde 2d403b7b-06e8-320c-b013-4f684ad53be2 f77889f6-ef5a-4eed-a4cd-5d67d4a6e9c5 1eb3360f-4c34-3310-9ce6-845ea9272c56 5546df9c-9310-3ed5-929a-d7da19e18bf8 a1358c59-b28d-3ddb-af1c-3a5d1c394ef5 1bd7db3a-0b42-31cf-ac1a-de88fd9fa721 a4400a38-bc38-391c-b102-ba385d7e475e 4fcdebe7-b52f-39e7-a5bc-c664eeba5e7b f7d568d4-0836-3f47-b330-f8d204c4b96e 412ccada-28df-3de2-b394-9cba3fca5bdf 6f3dbf4b-9559-340c-a3e4-cbe655bf2059 84c98474-28d8-309e-91c7-9cf9539825ab de23dfe1-c0b1-441b-810b-324090dc171b deec57d0-d31b-31ec-aa75-88db5d9dadf5 e95c8cc2-ddb3-3e7b-b8c3-e7584a778464 c3388791-4fef-3278-a085-26121cf5f513 45488531-3648-3e2d-8f9c-3c287032112d 21c0472c-5ba2-3276-aad4-b9aa66cb5fa3 98e7f0eb-4676-3120-94f1-8a790581e6a4 28bd43de-e2b7-3c60-a626-0e525f639357 5c0afbc0-a6ee-37c2-aebd-c1927caf7340 5677a441-abd2-3b29-9f0b-333e181cc907 d8192bbb-3b00-3c68-a79a-65872ea4276f 0fdbd56a-1ff7-3624-81f9-03cd68fd5616 6ef553eb-6dbb-3a2a-ae3e-ed7090b8826a bc20a6d3-2db2-3849-8843-1e1b8c93e5db 11a84740-18a3-3798-91c5-21dc9c765350 6a6e93f0-a130-3340-975b-b2c88b16d343 f6107596-76e0-3064-a4a6-86332a90e539 0f0cdd79-bc6c-35cd-9d99-7ae2fc7e165c d67d020a-4d28-3bfd-891d-d6aa7dcf0a69 a674e2e5-3dfd-3dd5-8503-192357b0e96c a89557fc-1268-36e5-9cce-335f2da27bc8 51428934-b0a7-3507-94e3-31d37bba38a3 f849731b-d288-3bec-8f35-6bea979f7dd8 91ac892f-d2c1-3143-b5c5-f0d4640cfc0d b48a15fb-2e84-34df-946f-ad72b3d7296f e7547e4c-1ebc-3428-8964-a5b91e81098e 069cc46d-38bb-309d-88cf-296a3d0c0820 8c52d911-fe34-3424-9864-d3fdfac38064 d33f667d-7b6c-39aa-9ba9-eac2fa615ae1 aedbd525-e6df-4c0c-8be6-61c27fe58fd6 81d2b40a-c579-3e9c-b520-bee26cda947d 3153b5b3-d381-3664-8f82-1d3c5ca841d2 c780d53a-2d37-3cd8-9e89-530966aef53e 88f47a10-87b4-3ea8-a0c7-a07d825b647d a91d4c7b-bf55-3a0e-9eba-1a43577bcca8 25e5c600-36fe-3245-9cc0-40ef91620c22 d5d6f11c-3026-3e0e-9d67-c111233e22de 91aab547-1912-3b8e-8e7f-df3b202147bf e1d68dde-22a9-3918-a526-0850b21ff2eb 9bb1f857-8b61-369f-a537-484c1323ae32 b6c4361a-7dd8-32a0-83d4-7f9d2beaed08 798354fc-30ee-36f4-83b4-f49c3b307db5 62a1e53b-b55c-36c2-bc5b-e216d494875a 47167c79-2ba4-369c-8db8-760a30b4c38a 2b044433-ddc1-3580-b560-d46474934089 380e5bf0-1c68-36a4-ac64-09a03b60bebf 8066e267-a653-3b43-8fce-a5a780912c82 3c56f1ef-d4df-30ae-80f3-0a5b22d4d3a6 3fa8c20e-a4b4-3af6-b9c4-6cb96f83916d 8e02e2db-2836-37ec-af33-a1cc2e6e49dc a36f80a5-5edc-3842-80af-292ae639ee74 bb9be2e6-8f0e-3bb3-8bb9-5d9aa9df384d dbe19bf6-93ad-372e-b96d-f7b652cdba93 1992ed13-948e-34e6-8d9b-a3416e545a95 
a47ba6a9-ffa1-3979-bb40-512339284b8b 0a524e66-ee33-3b6c-89ef-eac1985316db e743b441-ea8a-36d7-8124-f14dfa13a0e6 8d8b550e-d0be-3cbb-a371-49ec36fa619f c85ebc24-0934-3423-9c14-f0fdbee64b68 65d3f43d-1969-35d4-bf86-bd5e4b1ac803 b51561d9-08b0-3599-bc78-016f1441bb91 8f317f00-f8b4-325e-a5c7-e4045427a610 2772dd5f-bc0a-47ea-ae19-a5e0dbef8f41 b98a7838-ac1f-339f-93c5-fe7f98ea8657 a146ab19-f4f3-334f-b830-fc68de83e26c 5481321f-d317-3e80-8061-6e9c635c4ca9 4a789b07-7578-36ec-89cd-68b01e0737fb f8825b65-5631-3417-8309-bd5677d694aa 790d3c83-f6bf-348e-80e7-12f29240e598 d26b95e4-d200-34e2-92c9-c16fda4cd9dd 945f3b20-778a-3581-adef-544de4a089ef 65732efc-1564-3ff8-8c7c-4239a08c0d70 5c7ee953-d8b0-33ef-a491-0bb716763cfe c67f439a-f945-33cb-8517-40c9fdf60d59 6f2f7d1e-8ded-35c5-ba83-3ca906b05127 72c31859-3676-3cbb-a773-0591d8d5799e 74a3e9ae-6811-4d11-a112-4c4963773cfe f41d0e8f-856e-3f7d-a3f9-ff5ba7c8e06d bd4a7d9d-14e1-3c17-873d-a74d0cd6a5d7 490f13c4-4c1f-3e3b-8a9f-0f27c6906b4e ed6ad297-ee09-3532-bcfc-c16ad5a05c49 595ec33e-a1aa-3aaf-8821-8d1780db354c 3933d1a2-f121-3c8a-8b01-7738e58c045f e0cfd042-ae29-3d21-bb47-81eb8f933ec8 b1a98ad6-9b3e-35fb-afae-70b279fcbfc0 1bf2bf1c-64d1-308f-afd1-220de9d30290 49a9df80-ab0a-31fb-9341-a79f7b0258dd 118a1e87-aff4-35f5-aa38-01504a63ddce 41c3597a-aab1-3123-85a1-dd5d459af461 9a8aea4b-9b61-3884-9f3c-84c3c36e6373 cea5f5c2-e786-30f5-8305-baead8923063 f03bfd11-5ba2-3bc6-ad76-4166b06491f5 e0d2fe70-8f98-3ce2-8d8f-4268a81f7169 da30abcf-652b-38df-a128-10942b225ec5 5e9fc665-2353-34da-a2e7-2094ab17e790 9b1da4e7-03a9-3277-91f3-ef6e610a6320 067b1c50-6567-3840-ab56-1ca2a0ed9c30 134bb8e9-9080-3bc5-948d-88d8cc034550 b56e3f47-72a6-34e8-9ada-b4169e28e5b9 84bb4b17-e7f2-3a1b-8c2b-6d6ec9a23e31 7c696d35-e34f-38b0-b4b4-e88803ad1f6a 8858428d-8fd5-3c3f-8ca4-d01f6e25e63c 93582b51-5be1-30cd-abb0-3eac16dd6dbc 32edd7c7-8a8f-360d-bcda-83ecf431e3e6 bdd7e8ba-f7fa-38d1-b6bf-9dc77334fec5 eb777faa-5b76-387e-a408-90524c6f2848 7ad46cf0-aa12-4050-ac2d-cf34b5f64d41 c990cafc-f96c-3107-b213-01d217b11272 61e56102-4d85-3a40-bbba-1a007c816f68 38609ed6-2445-3df3-bd92-849d3963510e a359e053-a350-36cf-ab1d-a7980afaffa2 76038978-47aa-30ed-bfa1-2d63753a866c c654b457-11d4-393c-a638-188855c8f2e5 5d062611-5417-3405-997c-1d1aefe4d85f 4058d838-75cb-35e2-af7e-a51aaa833271 6b0cc3b0-2802-33a7-b885-f1f1409345ac bb533c69-1e0b-341c-bedd-ff25fe9b84bf debbba6b-8cb8-3ab6-adfe-54fcc6b02839 133e2e0b-b0fe-3bb0-b1f9-c846fcfd29e8 edf3a727-664e-38be-b990-65d34012d926 ce34ff64-0faa-3fae-a79e-985f7a5172c9 0f257dcc-8606-3ef9-b17e-b022a3fc72c7 614812d4-3344-3975-a1c8-4131910c4a10 d9fd666a-8f55-38bb-8387-80fa44c29348 03b2cf2d-fb61-36fe-936f-36bbf197a8ac adcf7d18-0510-35b0-a2fa-b4cea13a6d76 e574050e-f787-3186-9686-2e9aca8102a0 f3d1e3c3-2770-3504-a592-b62619598812 953087a4-f704-37fe-a60f-82877e84a413 d5d40b4c-48d9-3b68-903a-025eb0fa334d 0749e9e0-ca52-3546-b324-d704138b11b5 e757cddd-5ff5-305a-af11-d7c6747d3979 d97ae2c0-b8d1-341c-94b7-f19d5fd2982a 46d917cd-531c-330b-8d7b-979b51a8927f 03fba633-8085-30bc-b675-687a715536ac 6419dcfd-8777-35fa-924c-ebefccde0a9b 855ba280-cd69-348d-9107-69e28cb8ad99 ce0575bf-c2fc-38bd-9947-ea7494a799f9 e125bb91-dcaf-3013-9cc7-da653d7e11e1 49d76058-b4f0-3931-86fa-de160b4c1b88 b48d6d4b-f0dd-35da-850d-36a715691e2f 99a3270d-c5c5-3df7-9a2d-a612c8104d0e 7cb4b11f-3872-3825-83b5-622e1a2cdb28 dc4d148d-f84c-307c-b2b7-f0cd7c267f57 106d962b-911d-354d-961d-9abe93119b9c 14bf638b-8f0d-35b2-a369-6d846b5b3892 8aad8778-73ce-3fa0-93c7-804ac998667d 5cf52bbe-f7f4-30c9-a4c2-a1fbb93513e4 bee1146d-2e80-37e3-b08a-6ac8858e8973 cdd752d0-caee-3d95-b1db-7fc20cbbc783 
9caf211e-3e6e-3996-8518-f617b9454e67 0a8a4cfa-4902-3a76-8301-08698d6290a2 e4279e3e-b7e1-3f43-aeef-2bfa2836dab6 6ff3a51a-e0ab-32be-beb5-4079e56933c6 4d324eb4-39f1-3837-9b97-c10db5d2b61d a1537c1c-775b-3969-ae13-2e83e5a4728a 0d9e4cff-73ff-33eb-9981-795475e62faf 74648e09-358d-3183-9b40-278620befa40 ad319b98-6faa-3648-98bd-43afdbd20020 b9f73e2a-292a-3876-b363-3ebb94584c7a 7d1d720d-6708-3148-917a-b8dc78f1dcd9 f64ed43e-417a-31ad-a322-b6108bf99a71 4d7b84b9-0a03-3aa1-83f0-4766013c3fb1 c96a09c8-46ed-391f-8a66-c46fa8b76029 5c1db299-e2a2-35e5-84dd-acda8fb393bc 194b6c89-8060-3174-b402-308f72cb1c15 9ecbfef8-29c6-334a-b4ff-aa8201439826 c4ea1b05-c7d5-3b59-aed1-9f3d2621ac00 b403f8a3-4cad-333e-8557-d7da2e163f4b 4f1b4bb2-b30b-3537-8fed-dd8f843f5adb d201af7e-48c8-34ad-be1c-e649af2cb5c2 0d8aab9f-4edf-3fb3-895a-ba64e8f2cfb2 81700b3c-2db4-3f72-935c-274d3607d6d2 62879808-1586-4d49-80fe-2f547e355191 dc9c2d63-083f-32c3-90ff-943ca823a245 e331aa95-3660-3c71-be9e-030bab0b8ee2 7c5e3704-33c8-3a4e-b032-9187a6f90206 35a15c5c-fa4a-3838-a724-396e112ec95c 5d55a63a-3146-32d9-89ec-e207e95ecbde 0322b098-7e42-34db-bcec-9a4d072191e9 a2f568b5-060f-33f0-9175-7e2062d86b6c 332b278a-a6b9-3bc3-b88c-241e4b03b4ef 0c61aea3-3cba-35f3-8971-df42cd5b9b1a 53f5011b-2a8f-3a73-9d86-805462bb542d c1a6c20c-e336-3efa-81b6-7c1242d70bd2 7a17d467-9f29-3706-8e40-32bb7fb033de 0ab21841-0c08-3bae-8424-daa9b336683f eb69a196-fb43-3ddf-9bbe-9d55fa1e8200 a3876690-9d49-3c98-9421-02cfe0ccb551 7c30c3fc-ea17-38d8-9c52-c75ccb112253 4935629c-fd9e-3b2f-b68e-9489c89585df 49e970c4-7364-33cb-a298-ead218e9a705 511b93af-f16e-3195-8628-fbb972a17f74 91923e20-9a05-32e0-ac53-8c09b0b60341 00a6ffc1-6ce9-3bc3-a060-6006e9893a1a b5e6e498-54b3-37bb-b2a3-cdac33a18363 c730e199-fb8d-3abf-b7aa-bbc81bf8c08f d37be0e2-8223-3eeb-a0e2-c4b75d5ff87b 9afab336-dbae-3f70-a669-46813f4570d7 ab3d8387-8e07-37f6-a74c-cf100fb6a612 1a10b0e6-569f-32db-95e8-10c074e353e8 382cf8af-6c8d-3ed9-907b-12214d2c7cb0 d842ce41-8d9c-3c0f-9c04-595d97be5140 7cd08674-1787-37d9-9365-988df023724b f150d98f-0109-3380-8480-c6846fb8e9c8 bd90cd1a-38b6-33b7-adec-ba7d4207a8c0 52071780-5758-3ed4-8835-0d64ecdc5575 04994d08-156c-3018-9717-ba0e29be8153 bf360aeb-1bbd-3c1e-b143-09cf83e4f2e4 24642607-2a51-384a-90a7-228067956d05 78da7b7e-8ddf-3c7d-8716-eaa890106dd3 ff0dbfc5-8a7b-3a6e-8936-e5e812e45408 78f7cb5c-9d51-34f0-b356-9b3d83263c75 7606de8d-486c-4916-9cbb-002ee966f834 858d739b-a0ba-35aa-bafc-4f7988bcad17 b6e967f6-92bc-3bf5-99c9-1b0c4649fd67 de9cf513-a0cd-3389-bc79-3f9f6f261317 95bf6003-7068-3a78-a0c0-9e470a06e60f f1275002-842e-3571-8f7d-05816bc7cf56 f292cc5c-7a90-360d-b62a-074c643bdf59 3bffdcff-c3a7-38b6-a0f2-64196d130958 472a240a-10cd-39cd-8681-558f7c7cf868 adf9a841-e0db-30ab-b5b3-bf0b61658e1e a060c4c1-b9fc-39c1-9d30-d93a124c9066 6aaf5b08-9f84-3a2e-8a32-2e50e5e11a3c a33a44fb-6008-3dc2-b7c5-2d27b70741e8 9e9bcfb7-601d-3d80-bc12-ef7025174beb 2e3f2ae7-9ab9-3aef-a3ce-a0a97a0cb1ab f4c94798-4d77-36ab-bdc5-c1194e5e7aff 52971a8a-ed62-3bfd-bcd4-ca3308b594e0 0aa4e8f5-2f9a-39a1-8f80-c2fdde4405a2 fbee355f-8878-31fa-8ac8-b9a45a3f130a 214e388e-cbd7-3dde-a204-d2ec42298808 280269f9-6111-311d-b351-ce9f63f88c81 20d47f81-46e8-3adf-a0ca-564fbb5c599d b2053fdc-0b94-30bc-aee7-5bc6fb7e9f52 02678d04-cc9f-3148-9f95-1ba66347dff9 29a00842-ead2-3050-b587-c5ef507e4125 9a448a80-0e9a-3bf0-90f3-21750dfef55a e858fb96-6b1f-3025-b40a-f71fd8d28c32 d70dae33-b4b2-36da-a4eb-345ef1c484cc 386c34fc-ff56-371c-9288-6ba42620f23a aaed41a5-47f2-3e0a-9645-2dbd871f744f be0615bc-1d82-334b-9c98-6adf40406955 4abe4fc9-183a-3ec1-9434-bc74fb724c0f b40c0cbf-5d35-30df-9f63-de088ada278e 
67be173f-28a9-3bcc-b110-4b81dfe3bf5e f554d503-4901-3b97-9516-a16398c66631 399064b4-6df3-3de8-8793-2738f8723ee3 ae908cc4-7301-3390-8940-eb9b679a8a39 a86ee261-b86b-34f7-92ab-be8367d1fc4c 3503b283-fbcd-3835-8779-0cb2b7ef55b0 1ca5291b-3178-3a93-a117-001497899b79 fb207d3b-d2d5-3100-94c0-9145aebc770b f7c4cf87-6bab-3723-bd74-1c9ac5add9cb 65f1eefa-cbc3-3d53-9991-dc0500ae9183 9320afa3-ed05-3364-a017-ae7ddc5d26c7 b248d26b-9c48-3d5f-bda1-a05ec99c2d97 7c4e5ad1-d604-3e44-81ae-68f7bfe21d27 e4221cc6-a19d-31ca-bf94-031adb0ea390 6784f175-e69d-3802-99df-d21ec2081878 97ae6596-a903-3045-836b-34f8206c6cfe 48c9cd36-68bf-3bb9-ab95-5e0a6fee61ab b42dc943-8b33-3b79-a260-14eb9f58a991 cf79d751-5d2a-3d5c-96a2-bb8d603f21e0 c2bbb391-a453-36af-b987-9d15f46b8589 803c44cc-e1de-3797-9b5f-15324a1604f8 af8471e6-6780-3df2-bc6a-1982a4b1b437 4e6d6bcd-8718-3e71-b9c1-7c352c991a56 6b6b2e8b-3f4d-3b7d-acaa-8f970cb12adb a7f532a3-87de-3129-8864-258396fd0b50 b7cbdba9-18ac-393a-8352-4841ffee722e 557dd6a4-2b80-3264-9c13-f70094526174 d029a394-7118-33c9-896d-eabb894f58c4 8ee606e6-4cbd-3c07-8419-fbda836ccaac ab8c747b-b9cb-3835-a275-54c56cb9a469 3e707e96-ad84-3e68-bea5-2f9ac502a2d9 1a7e18b5-d8dc-371d-be5f-03a37b113e81 7df1f32e-f059-3ac4-9d57-213f2f69b8b4 8e5442cf-8882-3b94-bc47-18fcad84bb20 3c27dfaf-1624-39d2-9075-158824ed8e8c ff8e7fdb-1073-3592-ba5e-8111bc3ce48b 770a58e6-eff6-39b7-a265-fe7f202fe8b2 ff52c01e-3d7b-32b1-b6a1-bcff3459ccdd 47358aac-2ec0-3d45-a837-f2069ca7cee3 18bdf01b-6ba6-30a8-a707-1f1458529d3d 71283e26-905b-3811-b9e0-c10c0253769b d0ba7a1b-f5ca-39d6-98d0-29c671baec65 29080565-8133-3274-80cf-6ea98078e50d 06e5ac08-f4cb-34ae-9406-3496f7cadc62 83faae69-e37e-4804-b7a9-684d4a900320 e4d53680-f7ef-364b-91a4-00e5aa91ab9b c94991c0-3662-3936-972c-1af63db486d8 79f3de22-c643-3e97-96d5-f77274a458c0 5bd6bd4d-3c89-3794-9935-2d044ce6ef37 f3cc42c7-84a8-35c5-8683-13878bb9beeb 9a25fd14-783b-35c3-ab2d-df4687f82b5e 7ccdda39-69b1-36d1-89c8-2acc3823264b 71d95611-9032-3787-a66e-e26313b08d46 b5a1b0b0-a7fc-3a47-af82-9b25a81a8c0b dd251cc5-736d-3b76-8ad3-3f6cb138178e 8a0ff1a2-9045-3be3-b67f-3914d88178ec 080b1ce2-9477-39ee-8233-b7f33e1dfe56 3dd173a6-8b21-3189-bd53-132919b96a48 2fff4135-98ec-3b82-a330-b73d8afdf36c 42c8449f-6e6d-3980-b54a-805eba6621c4 a1c1d559-0480-39d2-94f0-1a89f0226c4f cd2353c2-0fb6-3e18-8281-4c0df1d3189a d20c3612-a64d-3aa8-bd4a-58890413afbb 2716d83e-8c4f-39a3-a2a3-d5e255fe8a03 ea6895f2-504b-37b5-bfd0-cbf7017f22c3 3c3ed78e-1fcf-30ec-9e19-9bf142e2621d 20b00c37-4fe4-31dc-a258-dae253ae6992 c2f301b6-5d19-3296-a8ac-418ff48e052b fac8a63c-6b75-39d0-9f57-4344fde0f794 256c185c-284a-343e-93f4-894eed474edd 8c54e429-a3de-3eb3-96f3-d3127e2cc18f 41d69427-364c-366e-94a5-8e556bcac39f b436606f-daa4-337f-8103-4360bf4704d9 f84b4941-8e99-3957-b6f6-db1590338cf6 11420316-aec9-3ad9-8b4a-d618bcd180e9 48a52b7b-9391-3728-84f1-9aa6ca336214 f4d1a3c3-5002-336b-a67f-775b3725237e 2b6d18dc-4c95-3301-a498-3ed152798d5b ab83611b-436e-3de7-aad1-f0c9ad254196 389069d7-e6db-3d22-9328-e228c002bf75 e123ba3f-99bd-3039-b6e7-8c62eaebf9c2 12c3c14b-9cf2-3434-9a5d-e0bfa332f6ce 7da33189-2698-3a98-b038-b0e5a271ee96 3a789fb0-5cd2-3710-b8ea-f32fce38e3ca 7d3f2f76-2f4f-3762-bf0f-f94f79eb0404 16af3863-0d31-3cd1-8fa2-58053ffb953a 80da8956-f418-319c-9f49-3d47d9002546 eb222d5d-0052-3ce7-9b87-19e09054a2c0 b28a3715-4624-3a54-9652-b8f0b293a5a8 3b2e6033-f37f-3a73-9fab-88317b9b6095 a4e62775-131f-37c4-9239-c38e3b254dad f110598d-7e01-3ed7-a227-4e958987a31f 40870b19-3356-3e8e-a4a4-9f34eef8ea30 47972731-b0ea-3c38-a10f-5ffdd42329fc 991d11df-0265-3e41-b942-5b0c615d21e2 b81922e7-092f-3052-8cd1-fec6a6763295 
c858bd6a-81ab-3f54-b46d-ffc091ef6945 9807c577-0dc0-3116-864b-cf46a1276389 a6817756-af01-32ec-829f-d9e56ef7b6e8 95312039-73b9-35a2-9aec-905494a4f7f0 5d333477-796b-3e49-bf41-0cdbed39c8dd 86519a39-4ce9-3d0b-a3f9-dd9aa26a2b25 76c3f58f-9003-3bdb-90a3-b87cfbfa1c3b b213af37-7d89-342d-ae39-8a3c72159a01 cae56e40-8470-3c9c-af75-6e444189488f e50e7698-de3d-355f-aca2-eddd09c09533 4c33fc38-5e59-34f8-96ba-4e5a404d3988 44adf4c4-6064-362f-94d3-323ed42cfda9 da036982-92bf-36a8-b880-4ccf4e20b74e bdb9d309-f14b-3ff6-ad1f-5d3f3f95a13e 0b5142c1-420b-3fea-9e98-b87327ae22c6 0c3bad78-9f1e-395d-a376-2eb7499229fd bf382949-3515-3c16-b505-319442937a43 19f53e16-9f99-3035-9672-7e860f3b0048 e13c06cb-cd01-380e-946f-6d92ac1af49d 1f434d15-8745-3fba-9c3e-ccb026688397 1da4a0aa-22ae-3958-856d-05303de1f576 f3cd0d0d-8b71-3266-9732-d9f0d5778eb6 96dd6923-994c-3afe-9830-b15bdfd60f64 6fa5051b-0220-3e04-8ae3-7a199c2f5877 32835bfa-e53b-3526-9ec0-b0efcd11cbdf 4fae2ef6-7112-309a-b926-448a5a3e1802 a9a3d5d7-e0c6-3f24-af35-2acadc1aa2d9 1c8648f9-e7a1-3056-a2c0-19c8827a6a50 c45888cf-30f5-3e27-abeb-4f55caecc1f0 d4c7aa45-dfd6-3d71-bb8a-40efd5110d3b 34c79495-dbdf-393d-bcc6-e6f92f797628 87e61f5a-083c-305e-9ff4-5f699e85900a e95e20d1-7f04-34b9-9105-4333f11bf6b9 b0116f1c-f88f-3c09-b4bf-fc3c8ebeda56 9da07440-1001-3b00-a29f-c8bdc2f2b7d4 2ee0eda7-151a-3957-bab5-1e5370192122 c91f95de-d041-32f6-8b18-628a220be100 c6b7a5fb-8cd8-3ee2-8e99-b788eb02e731 ================================================ FILE: tools/data_converter/av2_val_split.txt ================================================ 22dcf96c-ef5e-376b-9db5-dc9f91040f5e 5b1d8b11-4f90-3577-be0b-193e102fda82 3f9796e9-c892-3915-b719-3292df878ece b5a7ff7e-d74a-3be6-b95d-3fc0042215f6 4d73c4eb-5de9-300c-b34f-ff5d0af89653 e40d67c5-3749-397e-aa2a-7dfe576a31b0 b43d449e-daaf-33a1-bb7f-3f7a0b5f056c 69c0ec7c-e289-3c4d-ade3-d2287ec34026 f5a3ee79-a131-3f8a-91e9-a6475d778149 d3dc783e-663a-31b1-bd85-46e04ca693db 9239d493-31d7-3dd0-a05f-03d50a242392 9946b521-ea55-3c52-9fd1-71afc3abf3c6 1579b300-e7f5-3318-97c2-2c827b0c411e 14f5485e-7417-3a5b-9be3-ec88461d03d4 41e31361-569b-3ed8-bafd-2308b7a9377e 4207ef92-0b3b-4708-8868-4ffcaef308e0 5d40499f-c9be-38b9-a0cb-cd234850ba85 73d86f1c-5e5c-3842-b671-7f29c78ccc55 fa708289-f2b2-399e-989e-53f83fa379c5 d1695c5e-08a9-44fd-8f45-93c23f700c8b dfc6d65f-20f5-389d-a5cd-81c1c7ecb11f 2c652f9e-8db8-3572-aa49-fae1344a875b 4a78c5db-041b-347b-9821-ceb82f99e3f8 677c7bcc-f29b-34ae-a91d-74cb863117c8 2a930061-3d8c-3915-8aac-f81199db95d8 b6c86134-d7e6-3af6-9db5-8aba3df4f7a7 b1527e96-5a5d-3adc-a893-314ab3a6012e a4087bac-8194-4c9e-8b2d-4bda58773a3c 58d01358-5927-36fa-9e11-d18d1dc1f4f0 87ce1d90-ca77-363b-a885-ec0ef6783847 460324ea-c769-38db-bba9-044643c8780e 074d2237-ed1b-34d7-a2fc-68edbce50bb2 e94f58d9-177b-31be-aa05-e6dd10d04124 c453a8e7-d3da-317a-946b-f8e9678a8582 31f062b7-dd17-3e7e-945d-198e91597de9 ff6adc87-5f47-32f7-b36a-546453c0e332 3c58172c-7a07-3ad4-bdf6-7cae60928c56 11995cbe-e076-3a35-910d-1e56ecf2c3c8 8feb3dbe-4450-3aeb-b22b-e65128aa696b 3cd2847c-604e-32b4-af19-6cd0da0dcdc5 8de6abb6-6589-3da7-8e21-6ecc80004a36 04973bcf-fc64-367c-9642-6d6c5f363b61 a4f72852-c2ff-35d3-8375-e52055508240 b9b1564c-66d0-4597-a664-2735cf2ffd04 bffb0c9e-5e3a-3251-ab5e-299491b53cbf 0b1b993a-68b3-3232-9afa-fc9942b5b79b 3b68c074-1680-3a93-92e5-5b711406f2fe c049334b-5568-3ca0-9b28-0c09d00b7bb3 ad870270-f3d8-3790-866a-78d61b5b76ee 78cbd619-8ded-35b8-87a1-38c4f4aeb82d 7ce85124-312b-35f0-a1a2-32206f75a947 5426cd2f-f4b9-3660-99d2-6617bb0f1b26 349c4c1c-9561-360f-9ae7-59772335d54b 96284bbc-6b58-330f-a5a6-76cd518543f0 
ef625e46-d0d4-38b9-9403-5614e7b39ec8 cd83b7cd-e2e7-34f6-bee7-1ff5ca3ed665 adc1fad7-de31-371f-810b-140576d9accc a98c14bf-bf01-3ae5-992d-ea9f0a18e3c7 0b324587-6097-3f92-a07a-a44f48c85d9e 35f32393-e82f-3b20-b214-1f6a43d60f23 36b38cbf-f6c5-3a12-8e7a-eb281cc9c2fc b9fcb487-363e-30a7-a316-a42dd81d8fe5 c222c78d-b574-4b9d-82e1-96a4f3f8bb27 51bbdd4d-3065-34ae-b369-b6e0444f34db ba67827f-6b99-3d2a-96ab-7c829eb999bb d5fa4d54-74ba-369c-a758-636441ad7f07 5f278cdd-ca28-3c53-8f5c-04e62308811d c865c156-0f26-411c-a16c-be985333f675 7b7f86ca-b430-3872-a131-ff5b4a6b5dcf 6da5d01e-54a7-3d7a-b86b-e0d6f8d3971d 3fca5366-2b2c-387b-b63c-7ae8f9e0cec1 5b614cfd-21c2-3b03-94c8-2a6c6bee166c 416f2e1c-0ffd-3089-97d2-0514b818f8d1 0c6e62d7-bdfa-3061-8d3d-03b13aa21f68 7e48bba5-438c-3813-9ce2-97c98868afed 756f4ed0-5352-31e4-b3c6-2841b9e779d7 6d3bfbc9-45dc-316e-a94c-a441371d0571 fd5c6932-2ee2-3cfb-9bdc-0b30bfb33a91 4a60c567-f167-3890-aa7e-01e75ccc40e0 87918291-e9ba-3759-be1a-4c874ca40997 4487b659-692e-3b35-9d1e-a230279ed646 aa539866-29e4-353e-95a9-b6d321b53b33 df1935dc-1e5f-3f4d-bdcb-e6c2bcb07667 087695bd-c662-3e86-83b4-aedc3b8eec36 19711b73-c43b-3922-be61-8c44df707a7d 1886b0d1-9c5e-326f-99df-30b64044638f b6642e23-d100-3680-8882-9f3b753b2eef 89f79c55-6698-3037-bd2e-d40c81af169a ca4144fb-10e5-3895-836f-87001f59ac65 dc3d4b79-6cd8-324b-bc70-cbd0e2a066da 28617035-7557-3cb9-99c2-754f72fd34b4 924116d9-0a48-3d97-b8c9-0d16b087c16a 3e7c4d87-dba1-3e22-a303-4f402f89cd20 2451c219-3002-3b2e-8fa9-2b7fea168b3b 5d9c1080-e6e9-3222-96a2-37ca7286a874 aa105408-2974-35e7-ae76-35060cfde21a 9efe1171-6faf-3427-8451-8f6469f7678e 9441ffdd-f06e-36e0-839e-b836b0f19bc9 20f785b0-e11a-3757-be79-b0731286c998 9fd55542-e982-361f-814f-61ad4ad07adf 2583a8ee-867d-3db6-b039-35b913fb8f70 b275d09d-9da2-380b-a748-528ee28bc9af e10475f7-0d56-3a75-870d-d4206fa165d7 120d7ac7-cce3-359e-a19c-1b9c0abd6be2 226199ab-c791-32a7-8bab-ab92878eb199 adbb2a17-a503-32cd-a9ed-b523b3e4da0b b8ce75e5-c1d2-3447-9249-70ab3d42389f 3b60751b-7a71-3a47-a743-96b96f0d9b2b 285ac213-8caf-31a4-b0fa-c240580f7f69 988ab841-c422-3d08-bb52-a09f8fdb6ab2 2ec904db-41aa-397c-a1e3-2e2ca0c8e8fb 5fe10166-ab1e-36d5-aa2b-c0d6f680f2c7 ba737c78-2ef2-3643-a5b2-4804dfff9d93 0526e68e-2ff1-3e53-b0f8-45df02e45a93 8934694e-8085-3673-96dd-eacebe691ed1 070bbf42-31d3-3aa9-aca4-c262afc9077d 5f8f4a26-59b1-3f70-bcab-b5e3e615d3bc 7de2e535-81df-3d5f-a5ca-62e4b940eb54 cd22abca-9150-3279-87a4-cb00ba517372 d89f80be-76d0-3853-8daa-76605cf4ce5e a7636fca-4d9e-3052-bef2-af0ce5d1df74 fbd62533-2d32-3c95-8590-7fd81bd68c87 7a8ec82c-1149-308b-8a12-477460843f35 e35a6aae-3608-38a7-b6e9-b5d6108b921d 5ea3cd9c-15d0-3b80-9cc4-02c8b5ad523a ed5fc860-c172-39c5-91c0-d712957fb1cd f2325996-961e-3f63-bbc0-44b7e76aeac9 7905533a-694b-35db-b39f-aec9e33fb3de c83da752-b12f-3fbd-b728-4abb9551723b b2d9d8a5-847b-3c3b-aed1-c414319d20af 131bd3d9-4f85-3ba3-b569-eb88308d79d5 e596b305-c951-3081-ae02-85406a473840 eb142141-683a-3a6d-a207-0302b1ff260d fdc0f552-4976-36a6-8691-9a8c6a5ba389 e68d1f0d-eb44-3751-975d-f80609f695ae 6ee06433-4820-3211-999a-95b79b2c692e 937093d8-7966-3df3-b334-0835595412b6 8940f5f1-13e0-3094-99ba-da2d17639774 919f13de-857f-3b1c-9f8e-7cbe500a60ae 5f5a25ff-ea07-3133-b5c6-26fada93f90f c93a30c8-168c-386c-a25a-cbd8d8410fbe e2e921fe-e489-3656-a0a2-5e17bd399ddf 27be7d34-ecb4-377b-8477-ccfd7cf4d0bc 9282db22-c361-3456-a7b5-414959f5f25e d70660da-4250-3ad1-a2d0-6a2d97b5379f 840b2b3f-5f52-32ae-b833-ad030063533d 87621780-827a-3df5-8fa5-a94267d2d807 307e27f2-6442-39a2-b62c-1e3d000cebaf 84ed050c-635f-36ec-9c28-8a0c10f5cf11 
================================================ FILE: tools/data_converter/nusc_split.py ================================================ TRAIN_SCENES = [ "scene-0002", "scene-0003", "scene-0004", "scene-0005", "scene-0006", "scene-0007", "scene-0008", "scene-0009", "scene-0012", "scene-0013", "scene-0014", "scene-0015", "scene-0016", "scene-0017", "scene-0018", "scene-0019", "scene-0021", "scene-0022", "scene-0023", "scene-0024", "scene-0025", "scene-0026", "scene-0027", "scene-0028", "scene-0029", "scene-0030", "scene-0031", "scene-0032", "scene-0033", "scene-0034", "scene-0035", "scene-0036", "scene-0039", "scene-0042", "scene-0043", "scene-0044", "scene-0045", "scene-0046", "scene-0047", "scene-0048", "scene-0049", "scene-0050", "scene-0051", "scene-0052", "scene-0055", "scene-0056", "scene-0057", "scene-0058", "scene-0059", "scene-0060", "scene-0061", "scene-0062", "scene-0063", "scene-0064", "scene-0065", "scene-0066", "scene-0067", "scene-0068", "scene-0069", "scene-0070", "scene-0071", "scene-0072", "scene-0073", "scene-0074", "scene-0075", "scene-0076", "scene-0092", "scene-0093", "scene-0094", "scene-0095", "scene-0096", "scene-0097", "scene-0098", "scene-0099", "scene-0100", "scene-0101", "scene-0102", "scene-0103", "scene-0104", "scene-0105", "scene-0106", "scene-0107", "scene-0108", "scene-0109", "scene-0110", "scene-0120", "scene-0123", "scene-0124", "scene-0125", "scene-0126", "scene-0127", "scene-0128", "scene-0129", "scene-0130", "scene-0131", "scene-0132", "scene-0133", "scene-0134", "scene-0135", "scene-0138", "scene-0149", "scene-0150", "scene-0151", "scene-0154", "scene-0155", "scene-0157", "scene-0158", "scene-0159", "scene-0161", "scene-0162", "scene-0163", "scene-0164", "scene-0165", "scene-0166", "scene-0167", "scene-0168", "scene-0170", "scene-0171", "scene-0172", "scene-0173", "scene-0174", "scene-0175", "scene-0176", "scene-0177", "scene-0178", "scene-0179", "scene-0180", "scene-0181", "scene-0182", "scene-0183", "scene-0185", "scene-0187", "scene-0188", "scene-0190", "scene-0191", "scene-0192", "scene-0193", "scene-0194", "scene-0195", "scene-0196", "scene-0199", "scene-0200", "scene-0202", "scene-0203", "scene-0204", "scene-0206", "scene-0207", "scene-0208", "scene-0209", "scene-0210", "scene-0211", "scene-0212", "scene-0213", "scene-0214", "scene-0218", "scene-0219", "scene-0220", "scene-0221", "scene-0222", "scene-0224", "scene-0225", "scene-0226", "scene-0227", "scene-0228", "scene-0229", "scene-0230", "scene-0231", "scene-0232", "scene-0233", "scene-0234", "scene-0235", "scene-0236", "scene-0237", "scene-0238", "scene-0239", "scene-0240", "scene-0241", "scene-0242", "scene-0243", "scene-0244", "scene-0245", "scene-0246", "scene-0247", "scene-0248", "scene-0249", "scene-0250", "scene-0251", "scene-0252", "scene-0253", "scene-0254", "scene-0255", "scene-0256", "scene-0257", "scene-0258", "scene-0259", "scene-0260", "scene-0261", "scene-0262", "scene-0263", "scene-0264", "scene-0268", "scene-0270", "scene-0271", "scene-0272", "scene-0273", "scene-0274", "scene-0275", "scene-0276", "scene-0277", "scene-0278", "scene-0283", "scene-0284", "scene-0285", "scene-0286", "scene-0287", "scene-0288", "scene-0289", "scene-0290", "scene-0291", "scene-0292", "scene-0293", "scene-0294", "scene-0295", "scene-0296", "scene-0297", "scene-0298", "scene-0299", "scene-0300", "scene-0301", "scene-0302", "scene-0303", "scene-0304", "scene-0305", "scene-0306", "scene-0315", "scene-0316", "scene-0317", "scene-0318", "scene-0321", "scene-0323", "scene-0324", "scene-0328", 
"scene-0329", "scene-0330", "scene-0331", "scene-0332", "scene-0344", "scene-0345", "scene-0346", "scene-0349", "scene-0350", "scene-0351", "scene-0352", "scene-0353", "scene-0354", "scene-0355", "scene-0356", "scene-0357", "scene-0358", "scene-0359", "scene-0360", "scene-0361", "scene-0362", "scene-0363", "scene-0364", "scene-0365", "scene-0367", "scene-0370", "scene-0371", "scene-0372", "scene-0373", "scene-0374", "scene-0375", "scene-0376", "scene-0377", "scene-0379", "scene-0380", "scene-0381", "scene-0382", "scene-0383", "scene-0384", "scene-0385", "scene-0386", "scene-0388", "scene-0399", "scene-0400", "scene-0401", "scene-0402", "scene-0403", "scene-0405", "scene-0406", "scene-0407", "scene-0408", "scene-0420", "scene-0421", "scene-0422", "scene-0423", "scene-0424", "scene-0425", "scene-0426", "scene-0427", "scene-0428", "scene-0429", "scene-0430", "scene-0431", "scene-0432", "scene-0433", "scene-0434", "scene-0435", "scene-0436", "scene-0437", "scene-0438", "scene-0439", "scene-0440", "scene-0441", "scene-0442", "scene-0443", "scene-0444", "scene-0445", "scene-0446", "scene-0447", "scene-0448", "scene-0449", "scene-0450", "scene-0451", "scene-0452", "scene-0453", "scene-0454", "scene-0455", "scene-0456", "scene-0457", "scene-0458", "scene-0459", "scene-0461", "scene-0462", "scene-0463", "scene-0464", "scene-0465", "scene-0467", "scene-0468", "scene-0469", "scene-0471", "scene-0472", "scene-0474", "scene-0475", "scene-0476", "scene-0477", "scene-0478", "scene-0479", "scene-0480", "scene-0499", "scene-0500", "scene-0501", "scene-0502", "scene-0504", "scene-0505", "scene-0506", "scene-0507", "scene-0508", "scene-0509", "scene-0510", "scene-0511", "scene-0512", "scene-0513", "scene-0514", "scene-0515", "scene-0517", "scene-0518", "scene-0519", "scene-0520", "scene-0521", "scene-0522", "scene-0523", "scene-0524", "scene-0552", "scene-0553", "scene-0554", "scene-0555", "scene-0559", "scene-0560", "scene-0561", "scene-0562", "scene-0563", "scene-0564", "scene-0565", "scene-0584", "scene-0585", "scene-0586", "scene-0587", "scene-0588", "scene-0589", "scene-0590", "scene-0591", "scene-0592", "scene-0593", "scene-0594", "scene-0595", "scene-0596", "scene-0597", "scene-0598", "scene-0599", "scene-0600", "scene-0625", "scene-0626", "scene-0627", "scene-0629", "scene-0630", "scene-0632", "scene-0633", "scene-0634", "scene-0635", "scene-0636", "scene-0637", "scene-0638", "scene-0639", "scene-0640", "scene-0652", "scene-0653", "scene-0654", "scene-0655", "scene-0656", "scene-0657", "scene-0658", "scene-0659", "scene-0660", "scene-0661", "scene-0662", "scene-0663", "scene-0664", "scene-0665", "scene-0666", "scene-0667", "scene-0668", "scene-0669", "scene-0670", "scene-0671", "scene-0672", "scene-0673", "scene-0674", "scene-0675", "scene-0676", "scene-0677", "scene-0678", "scene-0679", "scene-0681", "scene-0683", "scene-0684", "scene-0685", "scene-0686", "scene-0687", "scene-0688", "scene-0689", "scene-0695", "scene-0696", "scene-0697", "scene-0698", "scene-0700", "scene-0701", "scene-0703", "scene-0704", "scene-0705", "scene-0706", "scene-0707", "scene-0708", "scene-0709", "scene-0710", "scene-0711", "scene-0712", "scene-0713", "scene-0714", "scene-0715", "scene-0716", "scene-0717", "scene-0718", "scene-0719", "scene-0726", "scene-0727", "scene-0728", "scene-0730", "scene-0731", "scene-0733", "scene-0734", "scene-0735", "scene-0736", "scene-0737", "scene-0738", "scene-0780", "scene-0781", "scene-0782", "scene-0783", "scene-0784", "scene-0786", "scene-0787", "scene-0789", "scene-0790", 
"scene-0791", "scene-0792", "scene-0802", "scene-0806", "scene-0808", "scene-0809", "scene-0810", "scene-0811", "scene-0812", "scene-0813", "scene-0815", "scene-0816", "scene-0817", "scene-0819", "scene-0820", "scene-0821", "scene-0822", "scene-0847", "scene-0848", "scene-0849", "scene-0850", "scene-0851", "scene-0852", "scene-0853", "scene-0854", "scene-0855", "scene-0856", "scene-0858", "scene-0860", "scene-0861", "scene-0862", "scene-0863", "scene-0864", "scene-0865", "scene-0866", "scene-0868", "scene-0869", "scene-0870", "scene-0871", "scene-0872", "scene-0873", "scene-0875", "scene-0876", "scene-0877", "scene-0878", "scene-0880", "scene-0882", "scene-0883", "scene-0884", "scene-0885", "scene-0886", "scene-0887", "scene-0888", "scene-0889", "scene-0890", "scene-0891", "scene-0892", "scene-0893", "scene-0894", "scene-0895", "scene-0896", "scene-0897", "scene-0898", "scene-0899", "scene-0900", "scene-0901", "scene-0902", "scene-0903", "scene-0904", "scene-0905", "scene-0906", "scene-0907", "scene-0908", "scene-0909", "scene-0916", "scene-0917", "scene-0921", "scene-0922", "scene-0923", "scene-0925", "scene-0926", "scene-0927", "scene-0928", "scene-0929", "scene-0930", "scene-0931", "scene-0945", "scene-0947", "scene-0949", "scene-0952", "scene-0953", "scene-0955", "scene-0956", "scene-0957", "scene-0958", "scene-0959", "scene-0960", "scene-0961", "scene-0966", "scene-0967", "scene-0968", "scene-0969", "scene-0971", "scene-0972", "scene-0975", "scene-0976", "scene-0977", "scene-0978", "scene-0979", "scene-0980", "scene-0981", "scene-0982", "scene-0983", "scene-0984", "scene-0988", "scene-0989", "scene-0990", "scene-0991", "scene-0992", "scene-0994", "scene-0995", "scene-0996", "scene-0997", "scene-0998", "scene-0999", "scene-1000", "scene-1001", "scene-1004", "scene-1005", "scene-1006", "scene-1007", "scene-1008", "scene-1009", "scene-1010", "scene-1011", "scene-1012", "scene-1013", "scene-1014", "scene-1015", "scene-1019", "scene-1020", "scene-1021", "scene-1022", "scene-1023", "scene-1024", "scene-1025", "scene-1044", "scene-1045", "scene-1046", "scene-1047", "scene-1048", "scene-1049", "scene-1050", "scene-1051", "scene-1052", "scene-1053", "scene-1054", "scene-1064", "scene-1065", "scene-1066", "scene-1067", "scene-1068", "scene-1069", "scene-1070", "scene-1071", "scene-1072", "scene-1073", "scene-1074", "scene-1075", "scene-1076", "scene-1077", "scene-1078", "scene-1079", "scene-1080", "scene-1081", "scene-1082", "scene-1083", "scene-1084", "scene-1085", "scene-1086", "scene-1087", "scene-1088", "scene-1089", "scene-1090", "scene-1091", "scene-1092", "scene-1093", "scene-1094", "scene-1095", "scene-1096", "scene-1097", "scene-1098", "scene-1099", "scene-1100", "scene-1101", "scene-1102", "scene-1104", "scene-1105", "scene-1106", "scene-1107", "scene-1108", "scene-1109", "scene-1110"] VAL_SCENES = [ "scene-0001", "scene-0010", "scene-0011", "scene-0020", "scene-0038", "scene-0041", "scene-0053", "scene-0054", "scene-0121", "scene-0122", "scene-0139", "scene-0152", "scene-0160", "scene-0184", "scene-0269", "scene-0347", "scene-0348", "scene-0366", "scene-0368", "scene-0369", "scene-0378", "scene-0389", "scene-0390", "scene-0391", "scene-0392", "scene-0393", "scene-0394", "scene-0395", "scene-0396", "scene-0397", "scene-0398", "scene-0411", "scene-0412", "scene-0413", "scene-0414", "scene-0415", "scene-0416", "scene-0417", "scene-0418", "scene-0419", "scene-0525", "scene-0526", "scene-0527", "scene-0528", "scene-0529", "scene-0530", "scene-0531", "scene-0532", "scene-0533", 
"scene-0534", "scene-0535", "scene-0536", "scene-0537", "scene-0538", "scene-0539", "scene-0541", "scene-0542", "scene-0543", "scene-0544", "scene-0545", "scene-0546", "scene-0556", "scene-0557", "scene-0558", "scene-0566", "scene-0568", "scene-0570", "scene-0571", "scene-0572", "scene-0573", "scene-0574", "scene-0575", "scene-0576", "scene-0577", "scene-0578", "scene-0580", "scene-0582", "scene-0583", "scene-0642", "scene-0643", "scene-0644", "scene-0645", "scene-0646", "scene-0647", "scene-0648", "scene-0649", "scene-0650", "scene-0651", "scene-0739", "scene-0740", "scene-0741", "scene-0744", "scene-0746", "scene-0747", "scene-0749", "scene-0750", "scene-0751", "scene-0752", "scene-0757", "scene-0758", "scene-0759", "scene-0760", "scene-0761", "scene-0762", "scene-0763", "scene-0764", "scene-0765", "scene-0767", "scene-0768", "scene-0769", "scene-0770", "scene-0771", "scene-0775", "scene-0777", "scene-0778", "scene-0794", "scene-0795", "scene-0796", "scene-0797", "scene-0798", "scene-0799", "scene-0800", "scene-0803", "scene-0804", "scene-0911", "scene-0912", "scene-0913", "scene-0914", "scene-0915", "scene-0919", "scene-0920", "scene-0924", "scene-0962", "scene-0963", "scene-1002", "scene-1003", "scene-1016", "scene-1017", "scene-1018", "scene-1055", "scene-1056", "scene-1057", "scene-1058", "scene-1059", "scene-1060", "scene-1061", "scene-1062", "scene-1063"] CALIBRATION_SCENES = [ "scene-0852", "scene-0429", "scene-0956", "scene-0194", "scene-0811", "scene-1110", "scene-1107", "scene-0294", "scene-0900", "scene-0596", "scene-0296", "scene-0885", "scene-0866", "scene-0105", "scene-0782", "scene-0191", "scene-0876", "scene-0133", "scene-0231", "scene-0847", "scene-0363", "scene-0026", "scene-0791", "scene-0909", "scene-0002", "scene-0283", "scene-0007", "scene-0251", "scene-1100", "scene-0668", "scene-0584", "scene-0287", "scene-0260", "scene-0171", "scene-0789", "scene-0108", "scene-0190", "scene-0206", "scene-0635", "scene-0815", "scene-0058", "scene-0710", "scene-0302", "scene-0639", "scene-0166", "scene-0094", "scene-0735", "scene-0321", "scene-1091", "scene-0344" ] ================================================ FILE: tools/data_converter/nuscenes_converter.py ================================================ import mmcv import numpy as np from os import path as osp from pyquaternion import Quaternion import argparse from nusc_split import TRAIN_SCENES, VAL_SCENES nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier') nus_attributes = ('cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', 'pedestrian.standing', 'pedestrian.sitting_lying_down', 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None') FAIL_SCENES = ['scene-0499', 'scene-0502', 'scene-0515', 'scene-0517'] def parse_args(): parser = argparse.ArgumentParser(description='Data converter arg parser') parser.add_argument( '--data-root', type=str, help='specify the root path of dataset') parser.add_argument( '--newsplit', action='store_true') parser.add_argument( '-v','--version', choices=['v1.0-mini', 'v1.0-trainval', 'v1.0-test'], default='v1.0-trainval') args = parser.parse_args() return args def create_nuscenes_infos_map(root_path, dest_path=None, info_prefix='nuscenes', version='v1.0-trainval', new_split=False): """Create info file for map learning task on nuscene dataset. Given the raw data, generate its related info file in pkl format. Args: root_path (str): Path of the data root. 
info_prefix (str): Prefix of the info file to be generated. version (str): Version of the data. Default: 'v1.0-trainval' """ from nuscenes.nuscenes import NuScenes nusc = NuScenes(version=version, dataroot=root_path, verbose=True) from nuscenes.utils import splits assert version in ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] if version == 'v1.0-trainval': train_scenes = splits.train val_scenes = splits.val elif version == 'v1.0-test': train_scenes = splits.test val_scenes = [] else: train_scenes = splits.mini_train val_scenes = splits.mini_val if new_split: train_scenes = TRAIN_SCENES val_scenes = VAL_SCENES test = 'test' in version if test: print('test scene: {}'.format(len(train_scenes))) else: print('train scene: {}, val scene: {}'.format( len(train_scenes), len(val_scenes))) train_samples, val_samples, test_samples = [], [], [] train_sample_idx = 0 val_sample_idx = 0 for sample in mmcv.track_iter_progress(nusc.sample): lidar_token = sample['data']['LIDAR_TOP'] sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) cs_record = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) #mmcv.check_file_exist(lidar_path) scene_record = nusc.get('scene', sample['scene_token']) log_record = nusc.get('log', scene_record['log_token']) location = log_record['location'] scene_name = scene_record['name'] if scene_name in FAIL_SCENES: continue info = { 'lidar_path': lidar_path, 'token': sample['token'], 'cams': {}, 'lidar2ego_translation': cs_record['translation'], 'lidar2ego_rotation': cs_record['rotation'], 'e2g_translation': pose_record['translation'], 'e2g_rotation': pose_record['rotation'], 'timestamp': sample['timestamp'], 'location': location, 'scene_name': scene_name } # obtain 6 image's information per frame camera_types = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] for cam in camera_types: cam_token = sample['data'][cam] sd_rec = nusc.get('sample_data', cam_token) cs_record = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) cam2ego_rotation = Quaternion(cs_record['rotation']).rotation_matrix cam2ego_translation = np.array(cs_record['translation']) ego2cam_rotation = cam2ego_rotation.T ego2cam_translation = ego2cam_rotation.dot(-cam2ego_translation) transform_matrix = np.eye(4) #ego2cam transform_matrix[:3, :3] = ego2cam_rotation transform_matrix[:3, 3] = ego2cam_translation cam_info = dict( extrinsics=transform_matrix, # ego2cam intrinsics=cs_record['camera_intrinsic'], img_fpath=str(nusc.get_sample_data_path(sd_rec['token'])) ) info['cams'][cam] = cam_info if scene_name in train_scenes: info.update({ 'sample_idx': train_sample_idx, 'prev': train_sample_idx - 1, 'next': train_sample_idx + 1, }) if sample['prev'] == '': info['prev'] = -1 if sample['next'] == '': info['next'] = -1 train_samples.append(info) train_sample_idx += 1 elif scene_name in val_scenes: info.update({ 'sample_idx': val_sample_idx, 'prev': val_sample_idx - 1, 'next': val_sample_idx + 1, }) if sample['prev'] == '': info['prev'] = -1 if sample['next'] == '': info['next'] = -1 val_sample_idx += 1 val_samples.append(info) else: test_samples.append(info) if dest_path is None: dest_path = root_path if test: info_path = osp.join(dest_path, f'{info_prefix}_map_infos_test.pkl') print(f'saving test set to {info_path}') mmcv.dump(test_samples, info_path) else: # for training set if new_split: info_path = osp.join(dest_path, 
f'{info_prefix}_map_infos_train_newsplit.pkl') else: info_path = osp.join(dest_path, f'{info_prefix}_map_infos_train.pkl') print(f'saving training set to {info_path}') mmcv.dump(train_samples, info_path) # for val set if new_split: info_path = osp.join(dest_path, f'{info_prefix}_map_infos_val_newsplit.pkl') else: info_path = osp.join(dest_path, f'{info_prefix}_map_infos_val.pkl') print(f'saving validation set to {info_path}') mmcv.dump(val_samples, info_path) if __name__ == '__main__': args = parse_args() create_nuscenes_infos_map(root_path=args.data_root, version=args.version, new_split=args.newsplit) ================================================ FILE: tools/dist_test.sh ================================================ #!/usr/bin/env bash CONFIG=$1 CHECKPOINT=$2 GPUS=$3 PORT=${PORT:-29500} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} ================================================ FILE: tools/dist_train.sh ================================================ #!/usr/bin/env bash CONFIG=$1 GPUS=$2 PORT=${PORT:-29500} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} ================================================ FILE: tools/mmdet_test.py ================================================ import os.path as osp import pickle import shutil import tempfile import time import mmcv import torch import torch.distributed as dist from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info from mmdet.core import encode_mask_results def single_gpu_test(model, data_loader, show=False, out_dir=None, show_score_thr=0.3): model.eval() results = [] dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) batch_size = len(result) if show or out_dir: if batch_size == 1 and isinstance(data['img'][0], torch.Tensor): img_tensor = data['img'][0] else: img_tensor = data['img'][0].data[0] img_metas = data['img_metas'][0].data[0] imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) assert len(imgs) == len(img_metas) for i, (img, img_meta) in enumerate(zip(imgs, img_metas)): h, w, _ = img_meta['img_shape'] img_show = img[:h, :w, :] ori_h, ori_w = img_meta['ori_shape'][:-1] img_show = mmcv.imresize(img_show, (ori_w, ori_h)) if out_dir: out_file = osp.join(out_dir, img_meta['ori_filename']) else: out_file = None model.module.show_result( img_show, result[i], show=show, out_file=out_file, score_thr=show_score_thr) # encode mask results if isinstance(result[0], tuple): result = [(bbox_results, encode_mask_results(mask_results)) for bbox_results, mask_results in result] results.extend(result) for _ in range(batch_size): prog_bar.update() return results def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to 'tmpdir' and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. 
tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. Returns: list: The prediction results. """ model.eval() results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) time.sleep(2) # This line can prevent deadlock problem in some cases. for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) # encode mask results # if isinstance(result[0], tuple): # result = [(bbox_results, encode_mask_results(mask_results)) # for bbox_results, mask_results in result] results.extend(result) if rank == 0: batch_size = len(result) for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks if gpu_collect: results = collect_results_gpu(results, len(dataset)) else: results = collect_results_cpu(results, len(dataset), tmpdir) return results def collect_results_cpu(result_part, size, tmpdir=None): rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: mmcv.mkdir_or_exist('.dist_test') tmpdir = tempfile.mkdtemp(dir='.dist_test') tmpdir = torch.tensor( bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, f'part_{i}.pkl') part_list.append(mmcv.load(part_file)) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results def collect_results_gpu(result_part, size): rank, world_size = get_dist_info() # dump result part to tensor with pickle part_tensor = torch.tensor( bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') # gather all result part tensor shape shape_tensor = torch.tensor(part_tensor.shape, device='cuda') shape_list = [shape_tensor.clone() for _ in range(world_size)] dist.all_gather(shape_list, shape_tensor) # padding result part tensor to max length shape_max = torch.tensor(shape_list).max() part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') part_send[:shape_tensor[0]] = part_tensor part_recv_list = [ part_tensor.new_zeros(shape_max) for _ in range(world_size) ] # gather all result part dist.all_gather(part_recv_list, part_send) if rank == 0: part_list = [] for recv, shape in zip(part_recv_list, shape_list): part_list.append( pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] return ordered_results ================================================ FILE: tools/mmdet_train.py ================================================ import random import warnings import numpy as np import torch from mmcv.parallel import MMDataParallel, 
MMDistributedDataParallel from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, Fp16OptimizerHook, OptimizerHook, build_optimizer, build_runner) from mmcv.utils import build_from_cfg from mmdet.core import DistEvalHook, EvalHook from mmdet.datasets import (build_dataloader, build_dataset, replace_ImageToTensor) from mmdet.utils import get_root_logger def set_random_seed(seed, deterministic=False): """Set random seed. Args: seed (int): Seed to be used. deterministic (bool): Whether to set the deterministic option for CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` to True and `torch.backends.cudnn.benchmark` to False. Default: False. """ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) if deterministic: torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) if 'runner' not in cfg: cfg.runner = { 'type': 'EpochBasedRunner', 'max_epochs': cfg.total_epochs } warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) else: if 'total_epochs' in cfg: assert cfg.total_epochs == cfg.runner.max_epochs runner = build_runner( cfg.runner, default_args=dict( model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: if isinstance(runner, EpochBasedRunner): runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation 
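# (pops 'samples_per_gpu' from the val config; when it is greater than 1, the val pipeline's 'ImageToTensor' is replaced by 'DefaultFormatBundle' so batched samples can be collated, before building the val dataset and dataloader below)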
val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) ================================================ FILE: tools/slurm_test.sh ================================================ #!/usr/bin/env bash set -x PARTITION=$1 JOB_NAME=$2 CONFIG=$3 CHECKPOINT=$4 GPUS=${GPUS:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8} CPUS_PER_TASK=${CPUS_PER_TASK:-5} PY_ARGS=${@:5} SRUN_ARGS=${SRUN_ARGS:-""} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ --gres=gpu:${GPUS_PER_NODE} \ --ntasks=${GPUS} \ --ntasks-per-node=${GPUS_PER_NODE} \ --cpus-per-task=${CPUS_PER_TASK} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} ================================================ FILE: tools/slurm_train.sh ================================================ #!/usr/bin/env bash set -x PARTITION=$1 JOB_NAME=$2 CONFIG=$3 WORK_DIR=$4 GPUS=${GPUS:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8} CPUS_PER_TASK=${CPUS_PER_TASK:-5} SRUN_ARGS=${SRUN_ARGS:-""} PY_ARGS=${@:5} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ --gres=gpu:${GPUS_PER_NODE} \ --ntasks=${GPUS} \ --ntasks-per-node=${GPUS_PER_NODE} \ --cpus-per-task=${CPUS_PER_TASK} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} ================================================ FILE: tools/test.py ================================================ import argparse import mmcv import os import os.path as osp import torch import warnings from mmcv import Config, DictAction from mmcv.cnn import fuse_conv_bn from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, wrap_fp16_model) from mmdet3d.apis import single_gpu_test from mmdet3d.datasets import build_dataset from mmdet3d.models import build_model # from mmdet_test import multi_gpu_test from mmdet_train import set_random_seed from mmdet.datasets import replace_ImageToTensor def parse_args(): parser = argparse.ArgumentParser( description='MMDet test (and eval) a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', type=str, help='checkpoint file') parser.add_argument('--work-dir', 
help='the dir to save logs and models') parser.add_argument('--result-path', help='submission file in pickle format to be evaluated') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--format-only', action='store_true', help='Format the output results without perform evaluation. It is' 'useful when you want to format the result to a specific format and ' 'submit it to the test server') parser.add_argument( '--eval', action='store_true', help='whether to run evaluation.') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument( '--show-dir', help='directory where results will be saved') parser.add_argument( '--gpu-collect', action='store_true', help='whether to use gpu to collect results.') parser.add_argument( '--tmpdir', help='tmp directory used for collecting results from multiple ' 'workers, available when gpu-collect is not specified') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function (deprecate), ' 'change to --eval-options instead.') parser.add_argument( '--eval-options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.eval_options: raise ValueError( '--options and --eval-options cannot be both specified, ' '--options is deprecated in favor of --eval-options') if args.options: warnings.warn('--options is deprecated in favor of --eval-options') args.eval_options = args.options return args def main(): args = parse_args() assert args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. 
if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # import modules from plguin/xx, registry will be updated import sys sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib if hasattr(cfg, 'plugin_dir'): def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs,list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) else: # import dir is the dirpath for the config file _module_dir = os.path.dirname(args.config) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) cfg.model.pretrained = None # in case the test dataset is concatenated samples_per_gpu = 1 if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) if samples_per_gpu > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # set random seeds if args.seed is not None: set_random_seed(args.seed, deterministic=args.deterministic) # build the dataloader if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) cfg.data.test.work_dir = cfg.work_dir print('work_dir: ',cfg.work_dir) dataset = build_dataset(cfg.data.test) if args.result_path: outputs = args.result_path dataset._evaluate(args.result_path) return from plugin.datasets.builder import build_dataloader data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), ) from plugin.core.apis.test import custom_multi_gpu_test as multi_gpu_test # build the model and load checkpoint cfg.model.train_cfg = None model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') # embed() if args.fuse_conv_bn: model = fuse_conv_bn(model) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: kwargs = {} if args.eval_options is None else args.eval_options if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: eval_kwargs = cfg.get('evaluation', {}).copy() if args.eval_options is not None: eval_kwargs.update(args.eval_options) # hard-code way to remove EvalHook args for key in [ 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 'rule' ]: eval_kwargs.pop(key, None) print('start evaluation!') print(dataset.evaluate(outputs, **eval_kwargs)) if __name__ == '__main__': main() ================================================ FILE: tools/tracking/calculate_cmap.py ================================================ import argparse from mmcv import Config from mmdet3d.datasets import build_dataset import cv2 import torch import numpy as np import pickle import time from cmap_utils.utils import * from cmap_utils.match_utils import * from cmap_utils.data_utils import * font = cv2.FONT_HERSHEY_SIMPLEX location = (200,60) fontScale = 2 fontColor = (255,0,0) thickness = 2 lineType = 2 cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } id2cat = { 0:'ped_crossing', 1:'divider', 2:'boundary', } COLOR_MAPS_BGR = { # bgr colors 'divider': (0, 0, 255), 'boundary': (0, 255, 0), 'ped_crossing': (255, 0, 0), 'centerline': (51, 183, 255), 'drivable_area': (171, 255, 255) } COLOR_MAPS_PLT = { 'divider': 'r', 'boundary': 'g', 'ped_crossing': 'b', 'centerline': 'orange', 'drivable_area': 'y', } INTERP_NUM = 200 N_WORKERS = 0 def parse_args(): parser = argparse.ArgumentParser( description='Visualize groundtruth and results') parser.add_argument('config', help='config file path') 
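# remaining options: score threshold for filtering predictions, path to the prediction results pickle, and the consistency settings (--consist, --cons_frames) used by the consistent-mAP evaluation below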
parser.add_argument('--thr', type=float, default=0.4, help='score threshold to filter predictions') parser.add_argument( '--result_path', default=None, help='directory to submission file') parser.add_argument( '--consist', default=1, type=int, help='whether to use the consistent criterion' ) parser.add_argument( '--cons_frames', default=5, help='consective frames for cons metric' ) args = parser.parse_args() return args def instance_match(pred_lines, scores, gt_lines, threshold, metric='chamfer'): ### obtain tp,fp,score for a frame based on chamfer distance num_preds = pred_lines.shape[0] num_gts = gt_lines.shape[0] # tp and fp tp = np.zeros((num_preds), dtype=np.float32) fp = np.zeros((num_preds), dtype=np.float32) if num_gts == 0: fp[...] = 1 return (tp.copy(),fp.copy()) if num_preds == 0: return (tp.copy(),fp.copy()) assert pred_lines.shape[1] == gt_lines.shape[1], \ "sample points num should be the same" matrix = np.zeros((num_preds, num_gts)) matrix = chamfer_distance_batch(pred_lines, gt_lines) matrix_min = matrix.min(axis=1) matrix_argmin = matrix.argmin(axis=1) sort_inds = np.argsort(-scores) tp = np.zeros((num_preds), dtype=np.float32) fp = np.zeros((num_preds), dtype=np.float32) gt_covered = np.zeros(num_gts, dtype=bool) for i in sort_inds: if matrix_min[i] <= threshold: matched_gt = matrix_argmin[i] if not gt_covered[matched_gt]: gt_covered[matched_gt] = True tp[i] = 1 else: fp[i] = 1 else: fp[i] = 1 return (tp.copy(),fp.copy()) def _evaluate_single(pred_vectors, scores, gt_vectors, threshold, metric='chamfer'): ### collect tp-fp-score information pred_lines = np.array(pred_vectors) gt_lines = np.array(gt_vectors) if len(pred_lines) == 0 or len(gt_lines)==0: tp_fp_score = np.zeros((0,3)) return tp_fp_score scores = np.array(scores) tp_fp_list = instance_match(pred_lines, scores, gt_lines, threshold, metric) # (M, 2) tp, fp = tp_fp_list tp_fp_score = np.hstack([tp[:, None], fp[:, None], scores[:, None]]) return tp_fp_score def match_gt_w_pred(curr_data,curr_data_gt,thresh): ### find local id matching between predicted vector and gt vectors curr_vectors_np = {label: [] for label in cat2id.values()} curr_scores_np = {label: [] for label in cat2id.values()} for i in range(len(curr_data['labels'])): score = curr_data['scores'][i] label = curr_data['labels'][i] v = curr_data['vectors'][i] curr_vectors_np[label].append(v) curr_scores_np[label].append(score) curr_vectors = {} for label, vecs in curr_vectors_np.items(): if len(vecs) > 0: vecs = np.stack(vecs, 0) vecs = torch.tensor(vecs) curr_vectors[label] = vecs else: curr_vectors[label] = vecs curr_vectors_gt_np = curr_data_gt curr_vectors_gt = {} for label, vecs in curr_vectors_gt_np.items(): if len(vecs) > 0: vecs_np = [] for vec in vecs: vecs_np.append(vec) vecs = np.stack(vecs_np, 0) vecs = torch.tensor(vecs) curr_vectors_gt[label] = vecs else: curr_vectors_gt[label] = vecs pred2gt_matchings = find_matchings_chamfer(curr_vectors,curr_vectors_gt,curr_scores_np,thresh=thresh) return pred2gt_matchings def get_scene_matching_result(gts,pred_results,scene_name2token,scene_name,thresh=1.5): ### obtain local id matching of a scene start_token = scene_name2token[scene_name][0] vectors_seq = [] scores_seq = [] pred_matching_seq = [] vectors_gt_seq = [] pred2gt_matchings_seq = [] choose_scene = pred_results[start_token]['scene_name'] for local_idx,token in enumerate(scene_name2token[scene_name]): prev_data = pred_results[token] gt_vectors = gts[token] assert prev_data['scene_name'] == choose_scene assert prev_data['local_idx'] == local_idx 
vectors_gt_seq.append(gt_vectors) vectors = {label: [] for label in cat2id.values()} scores = {label: [] for label in cat2id.values()} pred_matching = {label: [] for label in cat2id.values()} for i in range(len(prev_data['labels'])): score, label, v,pred_glb_id = \ prev_data['scores'][i], prev_data['labels'][i], prev_data['vectors'][i], prev_data['global_ids'][i] vectors[label].append(v) scores[label].append(score) pred_matching[label].append(pred_glb_id) pred_matching_seq.append(pred_matching) vectors_seq.append(vectors) scores_seq.append(scores) pred2gt_matchings = match_gt_w_pred(prev_data,gt_vectors, thresh) pred2gt_matchings_seq.append(pred2gt_matchings) return vectors_seq, pred_matching_seq, pred2gt_matchings_seq def pred2gt_global_matching(ids_info,ids_info_gt,pred2gt_seq): ### obtain global id matching between predicted vectors and gt vectors of a scene pred2gt_global_seq = [] for frame_idx in range(len(pred2gt_seq)): f_match = pred2gt_seq[frame_idx] f_ids_info = ids_info[frame_idx] f_ids_info_gt = ids_info_gt[frame_idx] pred2gt_match_dict = {} for label in f_ids_info.keys(): pred2gt_match_dict[label] = {} f_label_match = f_match[label][0] f_ids_label_info,f_ids_label_info_gt = f_ids_info[label],f_ids_info_gt[label] for pred_match_idx, gt_match_idx in enumerate(f_label_match): pred_glb_match_idx = f_ids_label_info[pred_match_idx] if gt_match_idx != -1: gt_glb_match_idx = f_ids_label_info_gt[gt_match_idx] else: gt_glb_match_idx = -1 pred2gt_match_dict[label][pred_glb_match_idx] = gt_glb_match_idx pred2gt_global_seq.append(pred2gt_match_dict) return pred2gt_global_seq def get_tpfp_from_scene_single(scene_name,args,scene_name2token,pred_results,gts, gt_matching,threshold): ### generate tp-fp list in a single scene tpfp_score_record = {0:[],1:[],2:[]} scene_gt_matching = gt_matching[scene_name]['instance_ids'] if args.consist: vectors_seq, scene_pred_matching,pred2gt_seq \ = get_scene_matching_result(gts,pred_results,scene_name2token,scene_name,threshold) pred2gt_global_seq = pred2gt_global_matching(scene_pred_matching,scene_gt_matching,pred2gt_seq) vectors_seq = [] scores_seq = [] gt_flag_dict = {label:{} for label in cat2id.values()} for frame_idx, token in enumerate(scene_name2token[scene_name]): prev_data = pred_results[token] vectors_gt = gts[token] vectors = {label: [] for label in cat2id.values()} scores = {label: [] for label in cat2id.values()} for i in range(len(prev_data['labels'])): score, label, v = prev_data['scores'][i], prev_data['labels'][i], prev_data['vectors'][i] vectors[label].append(v) scores[label].append(score) for label in cat2id.values(): tpfp_score = _evaluate_single(vectors[label], scores[label], vectors_gt[label] ,threshold) if args.consist: #### deal with the consistency part for vec_idx,single_tpfp_score in enumerate(tpfp_score): curr_pred2gt_match = pred2gt_global_seq[frame_idx][label] ### pred_global_id: gt_global_id pred_local2global_mapping = scene_pred_matching[frame_idx][label] match_glb_pred_idx = pred_local2global_mapping[vec_idx] ### match_glb_gt_idx = curr_pred2gt_match[match_glb_pred_idx] if match_glb_gt_idx not in gt_flag_dict[label].keys(): gt_flag_dict[label][match_glb_gt_idx] = match_glb_pred_idx else: if match_glb_pred_idx != gt_flag_dict[label][match_glb_gt_idx]: tpfp_score[vec_idx][:2] = np.array([0,1]) tpfp_score_record[label].append(tpfp_score) vectors_seq.append(vectors) scores_seq.append(scores) return tpfp_score_record def get_mAP(tpfp_score_record,num_gts,threshold): ### calculate mean AP given tp-fp-score record result_dict = {} 
for cat_name,label in cat2id.items(): sum_AP = 0 result_dict[cat_name] = {} tp_fp_score = [np.vstack(i[label]) for i in tpfp_score_record] tp_fp_score = np.vstack(tp_fp_score) sort_inds = np.argsort(-tp_fp_score[:, -1]) tp = tp_fp_score[sort_inds, 0] fp = tp_fp_score[sort_inds, 1] tp = np.cumsum(tp, axis=0) fp = np.cumsum(fp, axis=0) eps = np.finfo(np.float32).eps recalls = tp / np.maximum(num_gts[label], eps) precisions = tp/np.maximum(tp+fp, eps) AP = average_precision(recalls, precisions, 'area') sum_AP += AP result_dict[cat_name].update({f'AP@{threshold}': AP}) return result_dict def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) dataset = build_dataset(cfg.eval_config) dataset[0] scene_name2idx = {} scene_name2token = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] token = sample['token'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2token[scene] = [] scene_name2idx[scene].append(idx) scene_name2token[scene].append(token) all_scene_names = sorted(list(scene_name2idx.keys())) gt_matching_path = cfg.eval_config.ann_file.replace('.pkl','_gt_tracks.pkl',) with open(gt_matching_path,'rb') as pf: gt_matching = pickle.load(pf) pred_matching_path = args.result_path with open(pred_matching_path,'rb') as ppf: pred_matching_result_raw = pickle.load(ppf) roi_size = torch.tensor(cfg.roi_size).numpy() origin = torch.tensor(cfg.pc_range[:2]).numpy() if roi_size[0] == 60: thresholds_list = [0.5,1.0,1.5] elif roi_size[0] == 100: thresholds_list = [1.0, 1.5, 2.0] else: raise ValueError('roi size {} not supported, check again...'.format(roi_size)) if 'newsplit' in args.result_path: gts = get_gts(dataset,new_split=True) else: gts = get_gts(dataset) ### interpolate vector data start_time = time.time() denormed_gts,pred_matching_result,num_gts,num_preds = \ get_data(pred_matching_result_raw,gts,origin,roi_size,INTERP_NUM,result_path=args.result_path,denorm=False) print('Preparing Data Time {}'.format(time.time()-start_time)) ### obtain mAP for each threshold scene_name_list = [] for single_scene_name in all_scene_names: scene_name_list.append( (single_scene_name,args) ) result_dict = {thr:{} for thr in thresholds_list} for threshold in thresholds_list: tpfp_score_list =[] for (scene_name,args) in scene_name_list: tpfp_score = get_tpfp_from_scene_single(scene_name,args,scene_name2token,pred_matching_result, denormed_gts,gt_matching,threshold) tpfp_score_list.append(tpfp_score) result_dict[threshold] = get_mAP(tpfp_score_list,num_gts,threshold) print(result_dict[threshold]) cat_mean_AP = np.array([0.,0.,0.]) mean_AP = 0 for thr in thresholds_list: for cat_name in cat2id.keys(): mean_AP += result_dict[thr][cat_name]['AP@{}'.format(thr)] cat_mean_AP[cat2id[cat_name]] += result_dict[thr][cat_name]['AP@{}'.format(thr)] cat_map_dict = {cat:cat_mean_AP[idx]/len(thresholds_list) for cat,idx in cat2id.items() } print('Category mean AP',cat_map_dict) print('mean AP ',mean_AP/(len(cat2id)*len(thresholds_list))) print('Overall Time',time.time()-start_time) if __name__ == '__main__': main() ================================================ FILE: tools/tracking/cmap_utils/__init__.py ================================================ ================================================ FILE: tools/tracking/cmap_utils/data_utils.py ================================================ import mmcv import os from mmdet3d.datasets import build_dataloader import numpy as np from copy import deepcopy from functools import partial from multiprocessing 
import Pool from .utils import * from .match_utils import * cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } def get_gts(dataset,new_split=False,N_WORKERS=16): roi_size = dataset.roi_size if 'av2' in dataset.ann_file: dataset_name = 'av2' else: dataset_name = 'nusc' if new_split: tmp_file = f'./tmp_gts_{dataset_name}_{roi_size[0]}x{roi_size[1]}_newsplit.pkl' else: tmp_file = f'./tmp_gts_{dataset_name}_{roi_size[0]}x{roi_size[1]}.pkl' if os.path.exists(tmp_file): print(f'loading cached gts from {tmp_file}') gts = mmcv.load(tmp_file) else: print('collecting gts...') gts = {} # pdb.set_trace() dataloader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=N_WORKERS, shuffle=False, dist=False) pbar = mmcv.ProgressBar(len(dataloader)) for data in dataloader: token = deepcopy(data['img_metas'].data[0][0]['token']) gt = deepcopy(data['vectors'].data[0][0]) # pdb.set_trace() gts[token] = gt pbar.update() del data # avoid dataloader memory crash for token, gt in gts.items(): for label, vectors in gt.items(): label_vecs = [] for vec in vectors: label_vecs.append(interp_fixed_num(vec,20)) gt[label] = label_vecs gts[token] = gt return gts def prepare_data_multi(token,idx,pred,gts,origin,roi_size,interp_num,dataset,denorm=False): num_gts = np.array([0,0,0]) num_preds = np.array([0,0,0]) denorm_gt = {} gt = gts[token] denorm_gt = {label:[] for label in cat2id.values()} scores_by_cls = {label: [] for label in cat2id.values()} vector_list = [] for i in range(len(pred['labels'])): score = pred['scores'][i] vector = pred['vectors'][i].reshape(-1,2) label = pred['labels'][i] scores_by_cls[label].append(score) if not denorm: vector_list.append(interp_fixed_num(vector,interp_num)) else: vector_list.append(interp_fixed_num(vector*roi_size+origin,interp_num)) for label in cat2id.values(): for vec in gt[label]: denorm_gt[label].append(interp_fixed_num(vec,interp_num)) for label in cat2id.values(): num_gts[label] += len(gt[label]) num_preds[label] += len(scores_by_cls[label]) return token,idx,denorm_gt, vector_list, num_gts,num_preds def get_data(pred_matching_result_raw,gts,origin,roi_size,num_interp,result_path,denorm=False): ### collect data, interpolate with multi_processing token_list = [] for idx,pred_res in enumerate(pred_matching_result_raw): token = pred_res['meta']['token'] token_list.append( (token,idx,pred_matching_result_raw[idx]) ) dataset = 'av2' if 'av2' in result_path else 'nusc' fn = partial(prepare_data_multi,gts=gts,origin=origin,roi_size=roi_size,interp_num=num_interp,dataset=dataset,denorm=denorm) denormed_gts = {} pred_matching_result = {} num_gts = np.zeros(3) num_preds = np.zeros(3) with Pool(processes=16) as pool: data_infos = pool.starmap(fn,token_list) for data_info in data_infos: token,idx, denorm_gt,pred_vector, num_gts_single,num_preds_single = data_info denormed_gts[token] = denorm_gt pred_matching_result_raw[idx]['vectors'] = pred_vector pred_matching_result[token] = pred_matching_result_raw[idx] num_gts = num_gts + num_gts_single num_preds = num_preds + num_preds_single return denormed_gts,pred_matching_result,num_gts,num_preds ================================================ FILE: tools/tracking/cmap_utils/match_utils.py ================================================ import torch import numpy as np from scipy.optimize import linear_sum_assignment from .utils import * cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } def get_prev2curr_matrix(prev_meta,curr_meta): # get relative pose prev_e2g_trans = 
torch.tensor(prev_meta['ego2global_translation'], dtype=torch.float64) prev_e2g_rot = torch.tensor(prev_meta['ego2global_rotation'], dtype=torch.float64) curr_e2g_trans = torch.tensor(curr_meta['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = torch.tensor(curr_meta['ego2global_rotation'], dtype=torch.float64) prev_e2g_matrix = torch.eye(4, dtype=torch.float64) prev_e2g_matrix[:3, :3] = prev_e2g_rot prev_e2g_matrix[:3, 3] = prev_e2g_trans curr_g2e_matrix = torch.eye(4, dtype=torch.float64) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) prev2curr_matrix = curr_g2e_matrix @ prev_e2g_matrix return prev2curr_matrix def find_matchings_iou(src_masks, tgt_masks, thresh=0.1): """Find the matching of map elements between two temporally connected frame Args: src_masks (_type_): instance masks of prev frame tgt_masks (_type_): instance masks of current frame thresh (float, optional): IOU threshold for matching. Defaults to 0.1. """ def _mask_iou(mask1, mask2): intersection = (mask1 * mask2).sum() if intersection == 0: return 0.0 union = np.logical_or(mask1, mask2).sum() return intersection / union matchings = {} for label, src_instances in src_masks.items(): tgt_instances = tgt_masks[label] cost = np.zeros([len(src_instances), len(tgt_instances)]) for i, src_ins in enumerate(src_instances): for j, tgt_ins in enumerate(tgt_instances): iou = _mask_iou(src_ins, tgt_ins) cost[i, j] = -iou row_ind, col_ind = linear_sum_assignment(cost) label_matching = [-1 for _ in range(len(src_instances))] label_matching_reverse = [-1 for _ in range(len(tgt_instances))] for i, j in zip(row_ind, col_ind): if -cost[i, j] > thresh: label_matching[i] = j label_matching_reverse[j] = i matchings[label] = (label_matching, label_matching_reverse) return matchings def find_matchings_chamfer(pred_vectors, gt_vectors, score_dict,thresh=0.5): matchings = {} for label, src_instances in pred_vectors.items(): tgt_instances = gt_vectors[label] num_gts = len(tgt_instances) num_preds = len(src_instances) label_matching = [-1 for _ in range(len(src_instances))] label_matching_reverse = [-1 for _ in range(len(tgt_instances))] if len(src_instances) == 0 or len(tgt_instances)==0: matchings[label] = (label_matching, label_matching_reverse) continue cdist = chamfer_distance_batch(src_instances, tgt_instances) label_score = np.array(score_dict[label]) matrix_min = cdist.min(axis=1) # for each det, which gt is the closest to it matrix_argmin = cdist.argmin(axis=1) sort_inds = np.argsort(-label_score) gt_covered = np.zeros(num_gts, dtype=bool) tp = np.zeros((num_preds), dtype=np.float32) fp = np.zeros((num_preds), dtype=np.float32) for i in sort_inds: if matrix_min[i] <= thresh: matched_gt = matrix_argmin[i] if not gt_covered[matched_gt]: gt_covered[matched_gt] = True label_matching[i] = matched_gt label_matching_reverse[matched_gt] = i matchings[label] = (label_matching, label_matching_reverse) return matchings def get_consecutive_vectors(prev_vectors,curr_vectors,prev2curr_matrix,origin,roi_size): # transform prev vectors prev2curr_vectors = dict() for label, vecs in prev_vectors.items(): if len(vecs) > 0: vecs = np.stack(vecs, 0) vecs = torch.tensor(vecs) N, num_points, _ = vecs.shape denormed_vecs = vecs * roi_size + origin # (num_prop, num_pts, 2) denormed_vecs = torch.cat([ denormed_vecs, denormed_vecs.new_zeros((N, num_points, 1)), # z-axis denormed_vecs.new_ones((N, num_points, 1)) # 4-th dim ], dim=-1) # (num_prop, num_pts, 4) transformed_vecs = 
torch.einsum('lk,ijk->ijl', prev2curr_matrix, denormed_vecs.double()).float() normed_vecs = (transformed_vecs[..., :2] - origin) / roi_size # (num_prop, num_pts, 2) normed_vecs = torch.clip(normed_vecs, min=0., max=1.) prev2curr_vectors[label] = normed_vecs else: prev2curr_vectors[label] = vecs # convert to ego space for visualization for label in prev2curr_vectors: if len(prev2curr_vectors[label]) > 0: prev2curr_vectors[label] = prev2curr_vectors[label] * roi_size + origin if len(curr_vectors[label]) > 0: curr_vecs = torch.tensor(np.stack(curr_vectors[label])) curr_vectors[label] = curr_vecs * roi_size + origin if len(prev_vectors[label]) > 0: prev_vecs = torch.tensor(np.stack(prev_vectors[label])) prev_vectors[label] = prev_vecs * roi_size + origin return prev_vectors, curr_vectors, prev2curr_vectors def filter_vectors(data_info, origin,roi_size,thr,num_interp=20): ### filter vectors over threshold filtered_vectors = {label: [] for label in cat2id.values()} for i in range(len(data_info['labels'])): score = data_info['scores'][i] label = data_info['labels'][i] v = data_info['vectors'][i] if score > thr: interp_v = interp_fixed_num(v,num_interp) filtered_vectors[label].append( (np.array(interp_v) - origin)/roi_size ) return filtered_vectors ================================================ FILE: tools/tracking/cmap_utils/utils.py ================================================ import cv2 from PIL import Image, ImageDraw import os import torch import numpy as np from shapely.geometry import LineString def import_plugin(cfg): ''' import modules from plguin/xx, registry will be update ''' import sys sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' 
+ m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs, list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) def draw_polylines(vecs, roi_size, origin, cfg): results = [] for line_coords in vecs: canvas = np.zeros((cfg.canvas_size[1], cfg.canvas_size[0]), dtype=np.uint8) coords = (line_coords - origin) / roi_size * torch.tensor(cfg.canvas_size) coords = coords.numpy() cv2.polylines(canvas, np.int32([coords]), False, color=1, thickness=cfg.thickness) result = np.flipud(canvas) if result.sum() < 20: kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7,7)) result = cv2.dilate(result, kernel, iterations=1) results.append(result) return results def draw_polygons(vecs, roi_size, origin, cfg): results = [] for poly_coords in vecs: mask = Image.new("L", size=(cfg.canvas_size[0], cfg.canvas_size[1]), color=0) coords = (poly_coords - origin) / roi_size * torch.tensor(cfg.canvas_size) coords = coords.numpy() vert_list = [(x, y) for x, y in coords] if not (coords[0] == coords[-1]).all(): vert_list.append(vert_list[0]) ImageDraw.Draw(mask).polygon(vert_list, outline=1, fill=1) result = np.flipud(np.array(mask)) if result.sum() < 20: kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7,7)) result = cv2.dilate(result, kernel, iterations=1) results.append(result) return results def draw_instance_masks(vectors, roi_size, origin, cfg): masks = {} canvas = np.zeros((cfg.canvas_size[1], cfg.canvas_size[0])) for label, vecs in vectors.items(): if label == 0: masks[label] = draw_polygons(vecs, roi_size, origin, cfg) else: masks[label] = draw_polylines(vecs, roi_size, origin, cfg) for mask in masks[label]: canvas += mask return masks, canvas def interp_fixed_num(vector, num_pts): line = LineString(vector) distances = np.linspace(0, line.length, num_pts) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def chamfer_distance_batch(pred_lines, gt_lines): _, num_pts, coord_dims = pred_lines.shape if not isinstance(pred_lines, torch.Tensor): pred_lines = torch.tensor(pred_lines) if not isinstance(gt_lines, torch.Tensor): gt_lines = torch.tensor(gt_lines) dist_mat = torch.cdist(pred_lines.view(-1, coord_dims), gt_lines.view(-1, coord_dims), p=2) # (num_query*num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts)) # (num_query, num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=-1)) # (num_gt, num_q, num_pts, num_pts) dist1 = dist_mat.min(-1)[0].sum(-1) dist2 = dist_mat.min(-2)[0].sum(-1) dist_matrix = (dist1 + dist2).transpose(0, 1) / (2 * num_pts) return dist_matrix.numpy() def average_precision(recalls, precisions, mode='area'): recalls = recalls[np.newaxis, :] precisions = precisions[np.newaxis, :] assert recalls.shape == precisions.shape and recalls.ndim == 2 num_scales = recalls.shape[0] ap = 0. 
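# 'area' mode integrates the monotonised precision-recall curve; '11points' mode averages the best precision at recall thresholds 0.0, 0.1, ..., 1.0 (inputs here are single-scale, hence index 0 below)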
if mode == 'area': zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) ones = np.ones((num_scales, 1), dtype=recalls.dtype) mrec = np.hstack((zeros, recalls, ones)) mpre = np.hstack((zeros, precisions, zeros)) for i in range(mpre.shape[1] - 1, 0, -1): mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) ind = np.where(mrec[0, 1:] != mrec[0, :-1])[0] ap = np.sum( (mrec[0, ind + 1] - mrec[0, ind]) * mpre[0, ind + 1]) elif mode == '11points': for thr in np.arange(0, 1 + 1e-3, 0.1): precs = precisions[0, recalls[0, :] >= thr] prec = precs.max() if precs.size > 0 else 0 ap += prec ap /= 11 else: raise ValueError( 'Unrecognized mode, only "area" and "11points" are supported') return ap ================================================ FILE: tools/tracking/prepare_gt_tracks.py ================================================ import argparse import mmcv from mmcv import Config import os from mmdet3d.datasets import build_dataset, build_dataloader import cv2 import torch import numpy as np from PIL import Image, ImageDraw import copy import imageio from scipy.optimize import linear_sum_assignment import pickle from functools import partial from multiprocessing import Pool font = cv2.FONT_HERSHEY_SIMPLEX location = (200,60) fontScale = 2 fontColor = (255,0,0) thickness = 2 lineType = 2 N_WORKERS = 16 def parse_args(): parser = argparse.ArgumentParser( description='Visualize groundtruth and results') parser.add_argument('config', help='config file path') parser.add_argument('--result', default=None, help='prediction result to visualize. ' 'If submission file is not provided, only gt will be visualized') parser.add_argument( '--out-dir', default='demo', help='directory where visualization results will be saved') parser.add_argument( '--visualize', action="store_true", default=False, help='whether to visualize the formed gt tracks') args = parser.parse_args() return args def import_plugin(cfg): ''' import modules from plugin/xx, registry will be updated ''' import sys sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.'
+ m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs, list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) def draw_polylines(vecs, roi_size, origin, cfg): results = [] for line_coords in vecs: canvas = np.zeros((cfg.canvas_size[1], cfg.canvas_size[0]), dtype=np.uint8) coords = (line_coords - origin) / roi_size * torch.tensor(cfg.canvas_size) coords = coords.numpy() cv2.polylines(canvas, np.int32([coords]), False, color=1, thickness=cfg.thickness) result = np.flipud(canvas) if result.sum() < 20: kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7,7)) result = cv2.dilate(result, kernel, iterations=1) results.append(result) return results def draw_polygons(vecs, roi_size, origin, cfg): results = [] for poly_coords in vecs: mask = Image.new("L", size=(cfg.canvas_size[0], cfg.canvas_size[1]), color=0) coords = (poly_coords - origin) / roi_size * torch.tensor(cfg.canvas_size) coords = coords.numpy() vert_list = [(x, y) for x, y in coords] if not (coords[0] == coords[-1]).all(): vert_list.append(vert_list[0]) ImageDraw.Draw(mask).polygon(vert_list, outline=1, fill=1) result = np.flipud(np.array(mask)) if result.sum() < 20: kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7,7)) result = cv2.dilate(result, kernel, iterations=1) results.append(result) return results def draw_instance_masks(vectors, roi_size, origin, cfg): masks = {} for label, vecs in vectors.items(): if label == 0: masks[label] = draw_polygons(vecs, roi_size, origin, cfg) else: masks[label] = draw_polylines(vecs, roi_size, origin, cfg) return masks def _mask_iou(mask1, mask2): intersection = (mask1 * mask2).sum() if intersection == 0: return 0.0 union = np.logical_or(mask1, mask2).sum() return intersection / union def find_matchings(src_masks, tgt_masks, thresh=0.1): """Find the matching of map elements between two temporally connected frame Args: src_masks (_type_): instance masks of prev frame tgt_masks (_type_): instance masks of current frame thresh (float, optional): IOU threshold for matching. Defaults to 0.1. 
""" matchings = {} for label, src_instances in src_masks.items(): tgt_instances = tgt_masks[label] cost = np.zeros([len(src_instances), len(tgt_instances)]) for i, src_ins in enumerate(src_instances): for j, tgt_ins in enumerate(tgt_instances): iou = _mask_iou(src_ins, tgt_ins) cost[i, j] = -iou row_ind, col_ind = linear_sum_assignment(cost) label_matching = [-1 for _ in range(len(src_instances))] label_matching_reverse = [-1 for _ in range(len(tgt_instances))] for i, j in zip(row_ind, col_ind): if -cost[i, j] > thresh: label_matching[i] = j label_matching_reverse[j] = i matchings[label] = (label_matching, label_matching_reverse) return matchings def match_two_consecutive_frames(prev_data, curr_data, roi_size, origin, cfg): # get relative pose prev_e2g_trans = torch.tensor(prev_data['img_metas'].data['ego2global_translation'], dtype=torch.float64) prev_e2g_rot = torch.tensor(prev_data['img_metas'].data['ego2global_rotation'], dtype=torch.float64) curr_e2g_trans = torch.tensor(curr_data['img_metas'].data['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = torch.tensor(curr_data['img_metas'].data['ego2global_rotation'], dtype=torch.float64) prev_e2g_matrix = torch.eye(4, dtype=torch.float64) prev_e2g_matrix[:3, :3] = prev_e2g_rot prev_e2g_matrix[:3, 3] = prev_e2g_trans curr_g2e_matrix = torch.eye(4, dtype=torch.float64) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) prev2curr_matrix = curr_g2e_matrix @ prev_e2g_matrix # get vector data prev_vectors = copy.deepcopy(prev_data['vectors'].data) curr_vectors = copy.deepcopy(curr_data['vectors'].data) #meta_info = curr_data['img_metas'].data #imgs = [mmcv.imread(i) for i in meta_info['img_filenames']] #cam_extrinsics = meta_info['cam_extrinsics'] #cam_intrinsics = meta_info['cam_intrinsics'] #ego2cams = meta_info['ego2cam'] # transform prev vectors prev2curr_vectors = dict() for label, vecs in prev_vectors.items(): if len(vecs) > 0: vecs = np.stack(vecs, 0) vecs = torch.tensor(vecs) N, num_points, _ = vecs.shape denormed_vecs = vecs * roi_size + origin # (num_prop, num_pts, 2) denormed_vecs = torch.cat([ denormed_vecs, denormed_vecs.new_zeros((N, num_points, 1)), # z-axis denormed_vecs.new_ones((N, num_points, 1)) # 4-th dim ], dim=-1) # (num_prop, num_pts, 4) transformed_vecs = torch.einsum('lk,ijk->ijl', prev2curr_matrix, denormed_vecs.double()).float() normed_vecs = (transformed_vecs[..., :2] - origin) / roi_size # (num_prop, num_pts, 2) normed_vecs = torch.clip(normed_vecs, min=0., max=1.) 
prev2curr_vectors[label] = normed_vecs else: prev2curr_vectors[label] = vecs # convert to ego space for visualization for label in prev2curr_vectors: if len(prev2curr_vectors[label]) > 0: prev2curr_vectors[label] = prev2curr_vectors[label] * roi_size + origin if len(curr_vectors[label]) > 0: curr_vecs = torch.tensor(np.stack(curr_vectors[label])) curr_vectors[label] = curr_vecs * roi_size + origin prev2curr_masks = draw_instance_masks(prev2curr_vectors, roi_size, origin, cfg) curr_masks = draw_instance_masks(curr_vectors, roi_size, origin, cfg) prev2curr_matchings = find_matchings(prev2curr_masks, curr_masks, thresh=0.01) # For viz purpose, may display the maps in perspective images #viz_dir = os.path.join(scene_dir, '{}_viz_perspective'.format(local_idx)) #if not os.path.exists(viz_dir): # os.makedirs(viz_dir) #renderer.render_camera_views_from_vectors(curr_vectors, imgs, # cam_extrinsics, cam_intrinsics, ego2cams, 2, viz_dir) #renderer.render_bev_from_vectors(curr_vectors, out_dir=None, specified_path='cur.png') #renderer.render_bev_from_vectors(prev2curr_vectors, out_dir=None, specified_path='prev2cur.png') #from PIL import Image #background = Image.open("cur.png") #overlay = Image.open("prev2cur.png") #background = background.convert("RGBA") #overlay = overlay.convert("RGBA") #new_img = Image.blend(background, overlay, 0.5) #new_img.save("cur_overlapped.png","PNG") #import pdb; pdb.set_trace() return prev2curr_matchings def assign_global_ids(matchings_seq, vectors_seq): ids_seq = [] global_map_index = { 0: 0, 1: 0, 2: 0, } ids_0 = dict() for label, vectors in vectors_seq[0].items(): id_mapping = dict() for i, _ in enumerate(vectors): id_mapping[i] = global_map_index[label] global_map_index[label] += 1 ids_0[label] = id_mapping ids_seq.append(ids_0) # Trace all frames following the consecutive matching for t, vectors_t in enumerate(vectors_seq[1:]): ids_t = dict() for label, vectors in vectors_t.items(): reverse_matching = matchings_seq[t][label][1] id_mapping = dict() for i, _ in enumerate(vectors): if reverse_matching[i] != -1: prev_id = reverse_matching[i] global_id = ids_seq[-1][label][prev_id] else: global_id = global_map_index[label] global_map_index[label] += 1 id_mapping[i] = global_id ids_t[label] = id_mapping ids_seq.append(ids_t) return ids_seq def _denorm(vectors, roi_size, origin): for label in vectors: for i, vec in enumerate(vectors[label]): vectors[label][i] = vec * roi_size + origin return vectors def form_gt_track_single(scene_name, scene_name2idx, dataset, out_dir, cfg, args): print('Process scene {}'.format(scene_name)) renderer = dataset.renderer roi_size = torch.tensor(cfg.roi_size) origin = torch.tensor(cfg.pc_range[:2]) start_idx = scene_name2idx[scene_name][0] matchings_seq = [] vectors_seq = [] for idx in scene_name2idx[scene_name]: local_idx = idx - start_idx if idx == start_idx: prev_data = dataset[idx] if idx == scene_name2idx[scene_name][-1]: # prev_data is the last frame vectors_seq.append(prev_data['vectors'].data) break curr_data = dataset[idx+1] matchings = match_two_consecutive_frames(prev_data, curr_data, roi_size, origin, cfg) matchings_seq.append(matchings) vectors_seq.append(prev_data['vectors'].data) prev_data = curr_data # Derive global ids... 
# get global ids by traversing all consecutive matching results ids_info = assign_global_ids(matchings_seq, vectors_seq) matching_meta = { 'sample_ids':scene_name2idx[scene_name], 'instance_ids': ids_info, } if args.visualize: print('Visualize gt tracks for scene {}'.format(scene_name)) scene_dir = os.path.join(out_dir, scene_name) os.makedirs(scene_dir, exist_ok=True) # visualize with matched track ids imgs = [] for idx, (id_info, vectors) in enumerate(zip(ids_info, vectors_seq)): vectors = _denorm(vectors, roi_size.numpy(), origin.numpy()) save_path = os.path.join(scene_dir, f'{idx}_with_id.png') renderer.render_bev_from_vectors(vectors, out_dir=None, specified_path=save_path, id_info=id_info) viz_img = np.ascontiguousarray(cv2.imread(save_path)[:, :, ::-1], dtype=np.uint8) if idx == 0: img_shape = (viz_img.shape[1], viz_img.shape[0]) else: viz_img = cv2.resize(viz_img, img_shape) cv2.putText(viz_img, 't={}'.format(idx), location, font, fontScale, fontColor, thickness, lineType) imgs.append(viz_img) gif_path = os.path.join(scene_dir, 'matching.gif') imageio.mimsave(gif_path, imgs, duration=500) return scene_name, matching_meta def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) for split in ['train', 'val']: if split == 'train' and split not in cfg.match_config.ann_file: cfg.match_config.ann_file = cfg.match_config.ann_file.replace('val', 'train') if split == 'val' and split not in cfg.match_config.ann_file: cfg.match_config.ann_file = cfg.match_config.ann_file.replace('train', 'val') # build the dataset dataset = build_dataset(cfg.match_config) scene_name2idx = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2idx[scene].append(idx) all_scene_names = sorted(list(scene_name2idx.keys())) all_scene_matching_meta = {} out_dir = os.path.join(args.out_dir, split) if not os.path.exists(out_dir): os.makedirs(out_dir) all_scene_infos = [] for scene_idx, scene_name in enumerate(all_scene_names): all_scene_infos.append((scene_name,)) if N_WORKERS > 0: fn = partial(form_gt_track_single, scene_name2idx=scene_name2idx, dataset=dataset, cfg=cfg, out_dir=out_dir, args=args) pool = Pool(N_WORKERS) matching_results = pool.starmap(fn, all_scene_infos) pool.close() else: matching_results =[] for scene_info in all_scene_infos: scene_name = scene_info[0] single_matching_result = form_gt_track_single(scene_name=scene_name, scene_name2idx=scene_name2idx, dataset=dataset, cfg=cfg, out_dir=out_dir, args=args) matching_results.append(single_matching_result) for scene_name, matching_meta in matching_results: all_scene_matching_meta[scene_name] = matching_meta track_gt_path = cfg.match_config.ann_file[:-4] + '_gt_tracks.pkl' with open(track_gt_path, 'wb') as f: pickle.dump(all_scene_matching_meta, f, protocol=pickle.HIGHEST_PROTOCOL) if __name__ == '__main__': main() ================================================ FILE: tools/tracking/prepare_pred_tracks.py ================================================ import argparse import mmcv from mmcv import Config import os from mmdet3d.datasets import build_dataset import cv2 import torch import numpy as np import imageio import pickle from functools import partial from multiprocessing import Pool import time from cmap_utils.utils import * from cmap_utils.match_utils import get_prev2curr_matrix, find_matchings_iou, get_consecutive_vectors,filter_vectors font = cv2.FONT_HERSHEY_SIMPLEX location = (200,60) fontScale = 2 fontColor = (255,0,0) 
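# text-overlay constants for optional visualization (same values as in prepare_gt_tracks.py); cat2id below maps map-element categories to integer labels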
thickness = 2 lineType = 2 cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } N_WORKERS = 10 def parse_args(): parser = argparse.ArgumentParser( description='Visualize groundtruth and results') parser.add_argument('config', help='config file path') parser.add_argument('--thr', type=float, default=0.4, help='score threshold to filter predictions') parser.add_argument( '--result_path', default=None, help='directory to submission file') parser.add_argument( '--cons_frames', default=5, type=int, help='consective frames for matchings' ) parser.add_argument( '--visual', default=0, type=int, help='whether to visual' ) args = parser.parse_args() return args def match_two_consecutive_frames_pred(args,prev_data,prev_meta, curr_data, curr_meta,roi_size, origin, cfg): prev2curr_matrix = get_prev2curr_matrix(prev_meta,curr_meta) prev_vectors = filter_vectors(prev_data,origin,roi_size,args.thr) curr_vectors = filter_vectors(curr_data,origin,roi_size,args.thr) prev_vectors, curr_vectors, prev2curr_vectors = get_consecutive_vectors(prev_vectors,curr_vectors, prev2curr_matrix,origin,roi_size) prev2curr_masks, prev2curr_viz = draw_instance_masks(prev2curr_vectors, roi_size, origin, cfg) curr_masks, curr_viz = draw_instance_masks(curr_vectors, roi_size, origin, cfg) prev2curr_matchings = find_matchings_iou(prev2curr_masks, curr_masks, thresh=0.001) curr2prev_matchings = {label:[match_info[1],match_info[0]] for label,match_info in prev2curr_matchings.items()} return curr2prev_matchings def collect_pred(data,thr): vectors = {label: [] for label in cat2id.values()} scores = {label: [] for label in cat2id.values()} for i in range(len(data['labels'])): score, label, v = data['scores'][i], data['labels'][i], data['vectors'][i] if score > thr: vectors[label].append(np.array(v)) scores[label].append(score) return vectors, scores def get_scene_matching_result(args,cfg,pred_results,dataset,origin,roi_size, scene_name2idx): ### obtain local id sequence matching results of predictions vectors_seq = [] scores_seq = [] ids_seq = [] global_map_index = { 0: 0, 1: 0, 2: 0, } frame_token_list = [] pred_data_list = [] meta_list = [] for idx in scene_name2idx: token = dataset[idx]['img_metas'].data['token'] pred_data = pred_results[token] frame_token_list.append(token) meta_list.append(dataset[idx]['img_metas'].data) pred_data_list.append(pred_data) for local_idx in range(len(frame_token_list)): curr_pred_data = pred_data_list[local_idx] vectors_info, scores = collect_pred(curr_pred_data,args.thr) vectors_seq.append(vectors_info) scores_seq.append(scores) ### assign global id for the first frame if local_idx == 0: ids_0 = dict() for label, vectors in vectors_info.items(): id_mapping = dict() for i, _ in enumerate(vectors): id_mapping[i] = global_map_index[label] global_map_index[label] += 1 ids_0[label] = id_mapping ids_seq.append(ids_0) continue ### from the farthest to the nearest history_range = range(max(local_idx-args.cons_frames,0),local_idx) tmp_ids_list = [] for comeback_idx,prev_idx in enumerate(history_range): tmp_ids = {label:{} for label in cat2id.values()} curr_pred_data = pred_data_list[local_idx] comeback_pred_data = pred_data_list[prev_idx] curr_meta = meta_list[local_idx] comeback_meta = meta_list[prev_idx] curr2prev_matching = match_two_consecutive_frames_pred(args,comeback_pred_data,comeback_meta, curr_pred_data, curr_meta,roi_size, origin, cfg) for label,match_info in curr2prev_matching.items(): for curr_match_local_idx,prev_match_local_idx in enumerate(match_info[0]): if prev_match_local_idx == 
-1: tmp_ids[label][curr_match_local_idx] = -1 else: prev_match_global_idx = ids_seq[prev_idx][label][prev_match_local_idx] tmp_ids[label][curr_match_local_idx] = prev_match_global_idx tmp_ids_list.append(tmp_ids) ids_n = {label:{} for label in cat2id.values()} ### assign global id based on previous k frames' global id missing_matchings = {label:[] for label in cat2id.values()} for tmp_match in tmp_ids_list[::-1]: for label, matching in tmp_match.items(): for vec_local_idx, vec_glb_idx in matching.items(): if vec_local_idx not in ids_n[label].keys(): if vec_glb_idx != -1 and vec_glb_idx not in ids_n[label].values(): ids_n[label][vec_local_idx] = vec_glb_idx if vec_local_idx in missing_matchings[label]: missing_matchings[label].remove(vec_local_idx) else: missing_matchings[label].append(vec_local_idx) ### assign new id if one vector is not matched for label,miss_match in missing_matchings.items(): for miss_idx in miss_match: if miss_idx not in ids_n[label].keys(): ids_n[label][miss_idx] = global_map_index[label] global_map_index[label] += 1 ids_seq.append(ids_n) return ids_seq, vectors_seq, scores_seq, meta_list def generate_results(ids_info,vectors_seq,scores_seq,meta_list,scene_name): ### assign global id global_gt_idx = {} result_list = [] instance_count = 0 for f_idx in range(len(ids_info)): output_dict = {'vectors':[],'global_ids':[],'labels':[],'scores':[],'local_idx':[]} output_dict['scene_name'] = scene_name output_dict['meta'] = meta_list[f_idx] for label in cat2id.values(): for local_idx, global_label_idx in ids_info[f_idx][label].items(): overall_count_idx = label*100 + global_label_idx if overall_count_idx not in global_gt_idx.keys(): overall_global_idx = instance_count global_gt_idx[overall_count_idx] = overall_global_idx instance_count += 1 else: overall_global_idx = global_gt_idx[overall_count_idx] output_dict['global_ids'].append(overall_global_idx) output_dict['vectors'].append(vectors_seq[f_idx][label][local_idx]) output_dict['scores'].append(scores_seq[f_idx][label][local_idx]) output_dict['labels'].append(label) output_dict['local_idx'] = f_idx result_list.append(output_dict) return result_list def get_matching_single(scene_name,args,scene_name2idx,dataset,cfg,pred_results,origin,roi_size): name2idx = scene_name2idx[scene_name] ids_info, vectors_seq,scores_seq,meta_list = get_scene_matching_result(args,cfg,pred_results,dataset, origin,roi_size,name2idx) gen_result = generate_results(ids_info,vectors_seq,scores_seq,meta_list,scene_name) return (scene_name,ids_info,gen_result) def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) dataset = build_dataset(cfg.match_config) scene_name2idx = {} scene_name2token = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] token = sample['token'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2token[scene] = [] scene_name2idx[scene].append(idx) submission = mmcv.load(args.result_path) results = submission['results'] all_scene_names = sorted(list(scene_name2idx.keys())) all_scene_matching_meta = {} scene_info_list = [] for single_scene_name in all_scene_names: scene_info_list.append( (single_scene_name,args) ) roi_size = torch.tensor(cfg.roi_size).numpy() origin = torch.tensor(cfg.pc_range[:2]).numpy() start_time = time.time() if N_WORKERS > 0: fn = partial(get_matching_single, scene_name2idx=scene_name2idx,dataset=dataset,cfg=cfg, pred_results=results,origin=origin,roi_size=roi_size) pool = Pool(N_WORKERS) matching_results = pool.starmap(fn,scene_info_list) 
pool.close() else: matching_results =[] for scene_info in scene_info_list: scene_name = scene_info[0] single_matching_result = get_matching_single(scene_name=scene_name, scene_name2idx=scene_name2idx, args=args, dataset=dataset,cfg=cfg,pred_results=results,origin=origin,roi_size=roi_size) matching_results.append(single_matching_result) final_reuslt = [] for single_matching_info in matching_results: scene_name = single_matching_info[0] single_matching = single_matching_info[1] all_scene_matching_meta[scene_name] = single_matching final_reuslt.extend(single_matching_info[2]) meta_path = args.result_path.replace('submission_vector.json','pos_predictions_{}.pkl'.format(args.cons_frames)) with open(meta_path, 'wb') as f: pickle.dump(final_reuslt, f, protocol=pickle.HIGHEST_PROTOCOL) print('Matching Time',time.time()-start_time) if __name__ == '__main__': main() ================================================ FILE: tools/train.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- from __future__ import division import argparse import copy import mmcv import os import time import torch import warnings from mmcv import Config, DictAction from mmcv.runner import get_dist_info, init_dist, wrap_fp16_model from os import path as osp from mmdet import __version__ as mmdet_version from mmdet3d import __version__ as mmdet3d_version from mmdet3d.apis import train_model from mmdet3d.datasets import build_dataset from mmdet3d.models import build_model from mmdet3d.utils import collect_env, get_root_logger from mmdet.apis import set_random_seed from mmseg import __version__ as mmseg_version from mmcv.utils import TORCH_VERSION, digit_version def parse_args(): parser = argparse.ArgumentParser(description='Train a detector') parser.add_argument('config', help='train config file path') parser.add_argument('--work-dir', help='the dir to save logs and models') parser.add_argument( '--resume-from', help='the checkpoint file to resume from') parser.add_argument( '--no-validate', action='store_true', help='whether not to evaluate the checkpoint during training') group_gpus = parser.add_mutually_exclusive_group() group_gpus.add_argument( '--gpus', type=int, help='number of gpus to use ' '(only applicable to non-distributed training)') group_gpus.add_argument( '--gpu-ids', type=int, nargs='+', help='ids of gpus to use ' '(only applicable to non-distributed training)') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file (deprecate), ' 'change to --cfg-options instead.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) parser.add_argument( '--autoscale-lr', action='store_true', help='automatically scale lr with the number of gpus') args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.cfg_options: raise ValueError( '--options and --cfg-options cannot be both specified, ' '--options is deprecated in favor of --cfg-options') if args.options: warnings.warn('--options is deprecated in favor of --cfg-options') args.cfg_options = args.options return args def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # import modules from plguin/xx, registry will be updated import sys sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib if hasattr(cfg, 'plugin_dir'): def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs,list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) else: # import dir is the dirpath for the config file _module_dir = os.path.dirname(args.config) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume_from is not None: cfg.resume_from = args.resume_from if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW': cfg.optimizer['type'] = 'AdamW2' # fix bug in Adamw if args.autoscale_lr: # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # re-set gpu_ids with distributed training mode _, world_size = get_dist_info() cfg.gpu_ids = range(world_size) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') # specify logger name, if we still use 'mmdet', the output info will be # filtered and won't be saved in the log_file # TODO: ugly workaround to judge whether we are training det or seg model if cfg.model.type in ['EncoderDecoder3D']: logger_name = 'mmseg' else: logger_name = 'mmdet' logger = get_root_logger( log_file=log_file, log_level=cfg.log_level, name=logger_name) # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info meta['config'] = cfg.pretty_text # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}') set_random_seed(args.seed, deterministic=args.deterministic) cfg.seed = args.seed meta['seed'] = args.seed meta['exp_name'] = osp.basename(args.config) model = build_model( cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) model.init_weights() if cfg.get('SyncBN', False): import torch.nn as nn model = nn.SyncBatchNorm.convert_sync_batchnorm(model) logger.info("Using SyncBN") logger.info(f'Model:\n{model}') cfg.data.train.work_dir = cfg.work_dir cfg.data.val.work_dir = cfg.work_dir datasets = [build_dataset(cfg.data.train)] if len(cfg.workflow) == 2: val_dataset = copy.deepcopy(cfg.data.val) # in case we use a dataset wrapper if 'dataset' in cfg.data.train: val_dataset.pipeline = cfg.data.train.dataset.pipeline else: val_dataset.pipeline = cfg.data.train.pipeline # set test_mode=False here in deep copied config # which do not affect AP/AR calculation later # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa val_dataset.test_mode = False datasets.append(build_dataset(val_dataset)) if cfg.checkpoint_config is not None: # save mmdet version, config file content and class names in # checkpoints as meta data cfg.checkpoint_config.meta = dict( mmdet_version=mmdet_version, mmseg_version=mmseg_version, mmdet3d_version=mmdet3d_version, config=cfg.pretty_text, CLASSES=None, PALETTE=datasets[0].PALETTE # for segmentors if hasattr(datasets[0], 'PALETTE') else None) # add an attribute for visualization convenience # model.CLASSES = datasets[0].CLASSES from plugin.core.apis import custom_train_model custom_train_model( model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta) if __name__ == '__main__': main() ================================================ FILE: tools/visualization/vis_global.py ================================================ import sys import os SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 
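# make the parent tools/ directory importable so that `tracking.cmap_utils` (imported below) resolves when running this script directly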
sys.path.append(os.path.dirname(SCRIPT_DIR)) import argparse import mmcv from mmcv import Config import matplotlib.transforms as transforms from mmdet3d.datasets import build_dataset import cv2 import torch import numpy as np from PIL import Image import pickle from collections import defaultdict import matplotlib.pyplot as plt from shapely.geometry import LineString, Point from shapely.ops import nearest_points from scipy.spatial import ConvexHull from PIL import Image import cv2 import imageio import math from tracking.cmap_utils.match_utils import * def parse_args(): parser = argparse.ArgumentParser( description='Visualize groundtruth and results') parser.add_argument('config', help='config file path') parser.add_argument( '--out_dir', required=True, default="", help='') parser.add_argument( '--data_path', required=True, default="", help='Directory to submission file') parser.add_argument( '--scene_id', type=str, nargs='+', default=None, help='Specify the scene_id to visulize') parser.add_argument( '--option', required=True, default="vis-pred", help='vis-pred, vis-gt') parser.add_argument( '--simplify', default=0.5, type=float, help='Line simplification tolerance' ) parser.add_argument( '--line_opacity', default=0.75, type=float, help='Line opacity' ) parser.add_argument( '--overwrite', default=1, type=int, help='Whether to overwrite the existing visualization files' ) parser.add_argument( '--per_frame_result', default=1, type=int, help='Whether to visualize per frame result' ) parser.add_argument( '--dpi', default=20, type=int, help='DPI of the output image' ) parser.add_argument( '--transparent', default=False, action='store_true', help='Whether to use transparent background' ) args = parser.parse_args() return args def combine_images_with_labels(image_paths, labels, output_path, font_scale=0.5, font_color=(0, 0, 0)): # Load images images = [cv2.imread(path) for path in image_paths] # Determine the maximum dimensions max_height = max(image.shape[0] for image in images) max_width = max(image.shape[1] for image in images) # Create a blank white canvas to hold the 2x2 grid of images final_image = np.ones((max_height * 1, max_width * 2, 3), dtype=np.uint8) * 255 # Font settings font = cv2.FONT_HERSHEY_SIMPLEX for i, img in enumerate(images): # Resize image if necessary img = cv2.resize(img, (max_width, max_height)) # Calculate position for each image x_offset = (i % 2) * max_width y_offset = (i // 2) * max_height # Place image in the canvas final_image[y_offset:y_offset+max_height, x_offset:x_offset+max_width] = img # Add label cv2.putText(final_image, labels[i], (x_offset + 5, y_offset + 15), font, font_scale, font_color, 1, cv2.LINE_AA) # Save the final image cv2.imwrite(output_path, final_image) def merge_corssing(polylines): convex_hull_polygon = find_largest_convex_hull(polylines) return convex_hull_polygon def find_largest_convex_hull(polylines): # Merge all points from the polylines into a single collection all_points = [] for polyline in polylines: all_points.extend(list(polyline.coords)) # Convert the points to a NumPy array for processing with scipy points_array = np.array(all_points) # Compute the convex hull using scipy hull = ConvexHull(points_array) # Extract the vertices of the convex hull hull_points = points_array[hull.vertices] # Create a shapely Polygon object representing the convex hull convex_hull_polygon = LineString(hull_points).convex_hull return convex_hull_polygon def project_point_onto_line(point, line): """Project a point onto a line segment and return the 
projected point.""" line_start, line_end = np.array(line.coords[0]), np.array(line.coords[1]) line_vec = line_end - line_start point_vec = np.array(point.coords[0]) - line_start line_len = np.linalg.norm(line_vec) line_unitvec = line_vec / line_len point_vec_scaled = point_vec / line_len t = np.dot(line_unitvec, point_vec_scaled) t = np.clip(t, 0.0, 1.0) nearest = line_start + t * line_vec return Point(nearest) def find_nearest_projection_on_polyline(point, polyline): """Find the nearest projected point of a point onto a polyline.""" min_dist = float('inf') nearest_point = None for i in range(len(polyline.coords) - 1): segment = LineString(polyline.coords[i:i+2]) proj_point = project_point_onto_line(point, segment) dist = point.distance(proj_point) if dist < min_dist: min_dist = dist nearest_point = proj_point return np.array(nearest_point.coords) def find_and_sort_intersections(segmenet1, segment2): # Convert polylines to LineString objects # Find the intersection between the two LineStrings intersection = segmenet1.intersection(segment2) # Prepare a list to store intersection points intersections = [] # Check the type of intersection if "Point" in intersection.geom_type: # Single point or multiple points if intersection.geom_type == "MultiPoint": intersections.extend(list(intersection)) else: intersections.append(intersection) elif "LineString" in intersection.geom_type: # In case of lines or multiline, get boundary points (start and end points of line segments) if intersection.geom_type == "MultiLineString": for line in intersection: intersections.extend(list(line.boundary)) else: intersections.extend(list(intersection.boundary)) # Remove duplicates and ensure they are Point objects unique_intersections = [Point(coords) for coords in set(pt.coords[0] for pt in intersections)] # Sort the intersection points by their distance along the first polyline sorted_intersections = sorted(unique_intersections, key=lambda pt: segmenet1.project(pt)) return sorted_intersections def get_intersection_point_on_line(line, intersection): intersection_points = find_and_sort_intersections(LineString(line), intersection) if len(intersection_points) >= 2: line_intersect_start = intersection_points[0] line_intersect_end = intersection_points[-1] elif len(intersection_points) == 1: if intersection.contains(Point(line[0])): line_intersect_start = Point(line[0]) line_intersect_end = intersection_points[0] elif intersection.contains(Point(line[-1])): line_intersect_start = Point(line[-1]) line_intersect_end = intersection_points[0] else: return None, None else: return None, None return line_intersect_start, line_intersect_end def merge_l2_points_to_l1(line1, line2, line2_intersect_start, line2_intersect_end): # get nearest point on line2 to line2_intersect_start line2_point_to_merge = [] line2_intersect_start_dis = line2.project(line2_intersect_start) line2_intersect_end_dis = line2.project(line2_intersect_end) for point in np.array(line2.coords): point_geom = Point(point) dis = line2.project(point_geom) if dis > line2_intersect_start_dis and dis < line2_intersect_end_dis: line2_point_to_merge.append(point) # merged the points merged_line2_points = [] for point in line2_point_to_merge: # Use the `project` method to find the distance along the polyline to the closest point point_geom = Point(point) # Use the `interpolate` method to find the actual point on the polyline closest_point_on_line = find_nearest_projection_on_polyline(point_geom, line1) if len(closest_point_on_line) == 0: merged_line2_points.append(point) 
else: merged_line2_points.append(((closest_point_on_line + point) / 2)[0]) if len(merged_line2_points) == 0: merged_line2_points = np.array([]).reshape(0, 2) else: merged_line2_points = np.array(merged_line2_points) return merged_line2_points def segment_line_based_on_merged_area(line, merged_points): if len(merged_points) == 0: return np.array(line.coords), np.array([]).reshape(0, 2) first_merged_point = merged_points[0] last_merged_point = merged_points[-1] start_dis = line.project(Point(first_merged_point)) end_dis = line.project(Point(last_merged_point)) start_segmenet = [] for point in np.array(line.coords): point_geom = Point(point) if line.project(point_geom) < start_dis: start_segmenet.append(point) end_segmenet = [] for point in np.array(line.coords): point_geom = Point(point) if line.project(point_geom) > end_dis: end_segmenet.append(point) if len(start_segmenet) == 0: start_segmenet = np.array([]).reshape(0, 2) else: start_segmenet = np.array(start_segmenet) if len(end_segmenet) == 0: end_segmenet = np.array([]).reshape(0, 2) else: end_segmenet = np.array(end_segmenet) return start_segmenet, end_segmenet def get_bbox_size_for_points(points): if len(points) == 0: return 0, 0 # Initialize min and max coordinates with the first point min_x, min_y = points[0] max_x, max_y = points[0] # Iterate through each point to update min and max coordinates for x, y in points[1:]: min_x = min(min_x, x) min_y = min(min_y, y) max_x = max(max_x, x) max_y = max(max_y, y) return max_x - min_x, max_y - min_y def get_longer_segmenent_to_merged_points(l1_segment, l2_segment, merged_line2_points, segment_type="start"): # remove points from segments if it's too close to merged_line2_points l1_segment_temp = [] if len(merged_line2_points) > 1: merged_polyline = LineString(merged_line2_points) for point in l1_segment: if merged_polyline.distance(Point(point)) > 0.1: l1_segment_temp.append(point) elif len(merged_line2_points) == 1: for point in l1_segment: if Point(point).distance(Point(merged_line2_points[0])) > 0.1: l1_segment_temp.append(point) elif len(merged_line2_points) == 0: l1_segment_temp = l1_segment l1_segment = np.array(l1_segment_temp) l2_segmenet_temp = [] if len(merged_line2_points) > 1: merged_polyline = LineString(merged_line2_points) for point in l2_segment: if merged_polyline.distance(Point(point)) > 0.1: l2_segmenet_temp.append(point) elif len(merged_line2_points) == 1: for point in l2_segment: if Point(point).distance(Point(merged_line2_points[0])) > 0.1: l2_segmenet_temp.append(point) elif len(merged_line2_points) == 0: l2_segmenet_temp = l2_segment l2_segment = np.array(l2_segmenet_temp) if segment_type == "start": temp = l1_segment.tolist() if len(merged_line2_points) > 0: temp.append(merged_line2_points[0]) l1_start_box_size = get_bbox_size_for_points(temp) temp = l2_segment.tolist() if len(merged_line2_points) > 0: temp.append(merged_line2_points[0]) l2_start_box_size = get_bbox_size_for_points(temp) if l2_start_box_size[0]*l2_start_box_size[1] >= l1_start_box_size[0]*l1_start_box_size[1]: longer_segment = l2_segment else: longer_segment = l1_segment else: temp = l1_segment.tolist() if len(merged_line2_points) > 0: temp.append(merged_line2_points[-1]) l1_end_box_size = get_bbox_size_for_points(temp) temp = l2_segment.tolist() if len(merged_line2_points) > 0: temp.append(merged_line2_points[-1]) l2_end_box_size = get_bbox_size_for_points(temp) if l2_end_box_size[0]*l2_end_box_size[1] >= l1_end_box_size[0]*l1_end_box_size[1]: longer_segment = l2_segment else: longer_segment = 
l1_segment if len(longer_segment) == 0: longer_segment = np.array([]).reshape(0, 2) else: longer_segment = np.array(longer_segment) return longer_segment def get_line_lineList_max_intersection(merged_lines, line, thickness=4): pre_line = merged_lines[-1] max_iou = 0 merged_line_index = 0 for line_index, one_merged_line in enumerate(merged_lines): line1 = LineString(one_merged_line) line2 = LineString(line) thick_line1 = line1.buffer(thickness) thick_line2 = line2.buffer(thickness) intersection = thick_line1.intersection(thick_line2) if intersection.area / thick_line2.area > max_iou: max_iou = intersection.area / thick_line2.area pre_line = np.array(line1.coords) merged_line_index = line_index return intersection, pre_line, merged_line_index def algin_l2_with_l1(line1, line2): if len(line1) > len(line2): l2_len = len(line2) line1_geom = LineString(line1) interval_length = line1_geom.length / (l2_len - 1) line1 = [np.array(line1_geom.interpolate(interval_length * i)) for i in range(l2_len)] elif len(line1) < len(line2): l1_len = len(line1) line2_geom = LineString(line2) interval_length = line2_geom.length / (l1_len - 1) line2 = [np.array(line2_geom.interpolate(interval_length * i)) for i in range(l1_len)] # make line1 and line2 same direction, pre_line.coords[0] shold be closer to line2.coords[0] line1_geom = LineString(line1) line2_flip = np.flip(line2, axis=0) line2_traj_len = 0 for point_idx, point in enumerate(line2): line2_traj_len += np.linalg.norm(point - line1[point_idx]) flip_line2_traj_len = 0 for point_idx, point in enumerate(line2_flip): flip_line2_traj_len += np.linalg.norm(point - line1[point_idx]) if abs(flip_line2_traj_len - line2_traj_len) < 3: # get the trajectory length line2_walk_len = 0 for point in line2: point_geom = Point(point) proj_point = find_nearest_projection_on_polyline(point_geom, line1_geom) if len(proj_point) != 0: line2_walk_len += line1_geom.project(Point(proj_point[0])) flip_line2_walk_len = 0 for point in line2: point_geom = Point(point) proj_point = find_nearest_projection_on_polyline(point_geom, line1_geom) if len(proj_point) != 0: flip_line2_walk_len += line1_geom.project(Point(proj_point[0])) if flip_line2_walk_len < line2_walk_len: return line2_flip else: return line2 if flip_line2_traj_len < line2_traj_len: return line2_flip else: return line2 def _is_u_shape(line, direction): assert direction in ['left', 'right'], 'Wrong direction argument {}'.format(direction) line_geom = LineString(line) length = line_geom.length mid_point = np.array(line_geom.interpolate(length / 2).coords)[0] start = line[0] end = line[-1] if direction == 'left': cond1 = mid_point[0] < start[0] and mid_point[0] < end[0] else: cond1 = mid_point[0] > start[0] and mid_point[0] > end[0] dist_start_end = np.sqrt((start[0] - end[0])**2 + (start[1]-end[1])**2) cond2 = length >= math.pi / 2 * dist_start_end return cond1 and cond2 def check_circle(pre_line, vec): # if the last line in merged_lines is a circle if np.linalg.norm(pre_line[0] - pre_line[-1]) == 0: return True # if the last line in merged_lines is almost a circle and the new line is close to the circle if np.linalg.norm(pre_line[0] - pre_line[-1]) < 0.1: vec_2_circle_distance = 0 for point in vec: vec_2_circle_distance += LineString(pre_line).distance(Point(point)) if vec_2_circle_distance < 3: return True return False def connect_polygon(merged_polyline, merged_lines): start_end_connect = [merged_polyline[0], merged_polyline[-1]] iou = [] length_ratio = [] for one_merged_line in merged_lines: line1 = 
LineString(one_merged_line) line2 = LineString(start_end_connect) thickness = 1 thick_line1 = line1.buffer(thickness) thick_line2 = line2.buffer(thickness) intersection = thick_line1.intersection(thick_line2) iou.append(intersection.area / thick_line2.area) length_ratio.append(line1.length / line2.length) if max(iou) > 0.95 and max(length_ratio) > 3.0: merged_polyline = np.concatenate((merged_polyline, [merged_polyline[0]]), axis=0) return merged_polyline def iou_merge_boundry(merged_lines, vec, thickness=1): # intersection : the intersection area between the new line and the line in the merged_lines; is a polygon intersection, pre_line, merged_line_index = get_line_lineList_max_intersection(merged_lines, vec, thickness) # corner case: check if the last line in merged_lines is a circle if check_circle(pre_line, vec): return merged_lines # Handle U-shape, the main corner case if _is_u_shape(pre_line, 'left'): if _is_u_shape(vec, 'right'): # Two u shapes with opposite directions, directly generate a polygon exterior polygon = find_largest_convex_hull([LineString(pre_line), LineString(vec)]) merged_lines[-1] = np.array(polygon.exterior.coords) return merged_lines elif not _is_u_shape(vec, 'left'): line_geom1 = LineString(pre_line) line1_dists = np.array([line_geom1.project(Point(x)) for x in pre_line]) split_mask = line1_dists > line_geom1.length / 2 split_1 = LineString(pre_line[~split_mask]) split_2 = LineString(pre_line[split_mask]) # get the projected distance np1 = np.array(nearest_points(split_1, Point(Point(pre_line[-1])))[0].coords)[0] np2 = np.array(nearest_points(split_2, Point(Point(pre_line[0])))[0].coords)[0] dist1 = np.linalg.norm(np1-pre_line[-1]) dist2 = np.linalg.norm(np2-pre_line[0]) dist = min(dist1, dist2) if dist < thickness: line_geom2 = LineString(vec) dist1 = line_geom2.distance(Point(pre_line[0])) dist2 = line_geom2.distance(Point(pre_line[-1])) pt = pre_line[0] if dist1 <= dist2 else pre_line[-1] if vec[0][0] > vec[1][0]: vec = np.array(vec[::-1]) line_geom2 = LineString(vec) proj_length = line_geom2.project(Point(pt)) l2_select_mask = np.array([line_geom2.project(Point(x)) > proj_length for x in vec]) selected_l2 = vec[l2_select_mask] merged_result = np.concatenate([pre_line[:-1, :], pt[None, ...], selected_l2], axis=0) merged_lines[-1] = merged_result return merged_lines # align the new line with the line in the merged_lines so that points on two lines are traversed in the same direction vec = algin_l2_with_l1(pre_line, vec) line1 = LineString(pre_line) line2 = LineString(vec) # get the intersection points between IOU area and two lines line1_intersect_start, line1_intersect_end = get_intersection_point_on_line(pre_line, intersection) line2_intersect_start, line2_intersect_end = get_intersection_point_on_line(vec, intersection) # If no intersection points are found, use the last point of the line1 and the first point of the line2 as the intersection points --> this is a corner case that we will connect the two lines head to tail directly if line1_intersect_start is None or line1_intersect_end is None or line2_intersect_start is None or line2_intersect_end is None: line1_intersect_start = Point(pre_line[-1]) line1_intersect_end = Point(pre_line[-1]) line2_intersect_start = Point(vec[0]) line2_intersect_end = Point(vec[0]) # merge the points on line2's intersection area towards line1 merged_line2_points = merge_l2_points_to_l1(line1, line2, line2_intersect_start, line2_intersect_end) # merge the points on line1's intersection area towards line2 merged_line1_points = 
merge_l2_points_to_l1(line2, line1, line1_intersect_start, line1_intersect_end) # segment the lines based on the merged points (intersection area); split the line in to start segment and merged segment and end segment l2_start_segment, l2_end_segment = segment_line_based_on_merged_area(line2, merged_line2_points) l1_start_segment, l1_end_segment = segment_line_based_on_merged_area(line1, merged_line1_points) # choose the longer segment between line1 and line2 to be the final start segment and end segment start_segment = get_longer_segmenent_to_merged_points(l1_start_segment, l2_start_segment, merged_line2_points, segment_type="start") end_segment = get_longer_segmenent_to_merged_points(l1_end_segment, l2_end_segment, merged_line2_points, segment_type="end") merged_polyline = np.concatenate((start_segment, merged_line2_points, end_segment), axis=0) # corner case : check if need to connect the polyline to form a circle merged_polyline = connect_polygon(merged_polyline, merged_lines) merged_lines[merged_line_index] = merged_polyline return merged_lines def iou_merge_divider(merged_lines, vec, thickness=1): # intersection : the intersection area between the new line and the line in the merged_lines; is a polygon # pre_line : the line in merged_lines that has max IOU with the new line intersection, pre_line, merged_line_index = get_line_lineList_max_intersection(merged_lines, vec, thickness) # align the new line with the line in the merged_lines so that points on two lines are traversed in the same direction vec = algin_l2_with_l1(pre_line, vec) line1 = LineString(pre_line) line2 = LineString(vec) # get the intersection points between IOU area and two lines line1_intersect_start, line1_intersect_end = get_intersection_point_on_line(pre_line, intersection) line2_intersect_start, line2_intersect_end = get_intersection_point_on_line(vec, intersection) # If no intersection points are found, use the last point of the line1 and the first point of the line2 as the intersection points --> this is a corner case that we will connect the two lines head to tail directly if line1_intersect_start is None or line1_intersect_end is None or line2_intersect_start is None or line2_intersect_end is None: line1_intersect_start = Point(pre_line[-1]) line1_intersect_end = Point(pre_line[-1]) line2_intersect_start = Point(vec[0]) line2_intersect_end = Point(vec[0]) # merge the points on line2's intersection area towards line1 merged_line2_points = merge_l2_points_to_l1(line1, line2, line2_intersect_start, line2_intersect_end) # merge the points on line1's intersection area towards line2 merged_line1_points = merge_l2_points_to_l1(line2, line1, line1_intersect_start, line1_intersect_end) # segment the lines based on the merged points (intersection area); split the line in to start segment and merged segment and end segment l2_start_segment, l2_end_segment = segment_line_based_on_merged_area(line2, merged_line2_points) l1_start_segment, l1_end_segment = segment_line_based_on_merged_area(line1, merged_line1_points) # choose the longer segment between line1 and line2 to be the final start segment and end segment start_segment = get_longer_segmenent_to_merged_points(l1_start_segment, l2_start_segment, merged_line2_points, segment_type="start") end_segment = get_longer_segmenent_to_merged_points(l1_end_segment, l2_end_segment, merged_line2_points, segment_type="end") merged_polyline = np.concatenate((start_segment, merged_line2_points, end_segment), axis=0) # update the merged_lines merged_lines[merged_line_index] = 
merged_polyline
    return merged_lines


def merge_divider(vecs=None, thickness=1):
    merged_lines = []
    for vec in vecs:
        # if merged_lines is empty, add the first line
        if len(merged_lines) == 0:
            merged_lines.append(vec)
            continue
        # thicken the new line and every merged line, then compute the max IOU
        # between the new line and the existing merged lines
        iou = []
        for one_merged_line in merged_lines:
            line1 = LineString(one_merged_line)
            line2 = LineString(vec)
            thick_line1 = line1.buffer(thickness)
            thick_line2 = line2.buffer(thickness)
            intersection = thick_line1.intersection(thick_line2)
            iou.append(intersection.area / thick_line2.area)
        # If the max IOU is 0, add the new line to merged_lines as a new instance
        if max(iou) == 0:
            merged_lines.append(vec)
        # Otherwise, merge the new line into the best-matching merged line
        else:
            merged_lines = iou_merge_divider(merged_lines, vec, thickness=thickness)
    return merged_lines


def merge_boundary(vecs=None, thickness=1, iou_threshold=0.95):
    merged_lines = []
    for vec in vecs:
        # if merged_lines is empty, add the first line
        if len(merged_lines) == 0:
            merged_lines.append(vec)
            continue
        # thicken the new line and every merged line, then compute the max IOU
        # between the new line and the existing merged lines
        iou = []
        for one_merged_line in merged_lines:
            line1 = LineString(one_merged_line)
            line2 = LineString(vec)
            thick_line1 = line1.buffer(thickness)
            thick_line2 = line2.buffer(thickness)
            intersection = thick_line1.intersection(thick_line2)
            iou.append(intersection.area / thick_line2.area)
        # If the max IOU is larger than the threshold, the new line is already covered; skip it
        if max(iou) > iou_threshold:
            continue
        # If the IOU is not 0, merge the new line with the best-matching merged line,
        # otherwise keep it as a new instance
        if max(iou) > 0:
            merged_lines = iou_merge_boundry(merged_lines, vec, thickness=thickness)
        else:
            merged_lines.append(vec)
    return merged_lines


def get_consecutive_vectors_with_opt(prev_vectors=None, prev2curr_matrix=None, origin=None, roi_size=None, denormalize=False, clip=False):
    # transform prev vectors
    prev2curr_vectors = dict()
    for label, vecs in prev_vectors.items():
        if len(vecs) > 0:
            vecs = np.stack(vecs, 0)
            vecs = torch.tensor(vecs)
            N, num_points, _ = vecs.shape
            if denormalize:
                denormed_vecs = vecs * roi_size + origin  # (num_prop, num_pts, 2)
            else:
                denormed_vecs = vecs
            denormed_vecs = torch.cat([
                denormed_vecs,
                denormed_vecs.new_zeros((N, num_points, 1)),  # z-axis
                denormed_vecs.new_ones((N, num_points, 1))  # 4-th dim
            ], dim=-1)  # (num_prop, num_pts, 4)
            transformed_vecs = torch.einsum('lk,ijk->ijl', prev2curr_matrix, denormed_vecs.double()).float()
            normed_vecs = (transformed_vecs[..., :2] - origin) / roi_size  # (num_prop, num_pts, 2)
            if clip:
                normed_vecs = torch.clip(normed_vecs, min=0., max=1.)
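            # `normed_vecs` now holds the previous-frame vectors expressed in the current
            # ego frame (via the homogeneous prev2curr_matrix transform above) and
            # re-normalized to [0, 1] over the ROI; store them per label below.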
            prev2curr_vectors[label] = normed_vecs
        else:
            prev2curr_vectors[label] = vecs
    # convert to ego space for visualization
    for label in prev2curr_vectors:
        if len(prev2curr_vectors[label]) > 0:
            prev2curr_vectors[label] = prev2curr_vectors[label] * roi_size + origin
    return prev2curr_vectors


def get_prev2curr_vectors(vecs=None, prev2curr_matrix=None, origin=None, roi_size=None, denormalize=False, clip=False):
    # transform prev vectors
    if len(vecs) > 0:
        vecs = np.stack(vecs, 0)
        vecs = torch.tensor(vecs)
        N, num_points, _ = vecs.shape
        if denormalize:
            denormed_vecs = vecs * roi_size + origin  # (num_prop, num_pts, 2)
        else:
            denormed_vecs = vecs
        denormed_vecs = torch.cat([
            denormed_vecs,
            denormed_vecs.new_zeros((N, num_points, 1)),  # z-axis
            denormed_vecs.new_ones((N, num_points, 1))  # 4-th dim
        ], dim=-1)  # (num_prop, num_pts, 4)
        transformed_vecs = torch.einsum('lk,ijk->ijl', prev2curr_matrix, denormed_vecs.double()).float()
        vecs = (transformed_vecs[..., :2] - origin) / roi_size  # (num_prop, num_pts, 2)
        if clip:
            vecs = torch.clip(vecs, min=0., max=1.)
    # vecs = vecs * roi_size + origin
    return vecs


def plot_fig_merged_per_frame(num_frames, car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args):
    os.makedirs(pred_save_folder, exist_ok=True)
    # keep the current status of each instance; an instance is added to the dict when it first appears
    instance_bank = dict()
    # trace the path reversely, get the sub-sampled traj for visualizing the car
    pre_center = car_trajectory[-1][0]
    selected_traj_timesteps = []
    for timestep, (car_center, rotation_degrees) in enumerate(car_trajectory[::-1]):
        if np.linalg.norm(car_center - pre_center) < 5 and timestep > 0 and timestep < len(car_trajectory)-1:
            continue
        selected_traj_timesteps.append(len(car_trajectory)-1-timestep)
        pre_center = car_center
    selected_traj_timesteps = selected_traj_timesteps[::-1]
    image_list = [pred_save_folder + f'/{frame_timestep}.png' for frame_timestep in range(num_frames)]
    #save_t(len(image_list), pred_save_folder) # save the timestep text mp4 file
    # plot the figure at each frame
    for frame_timestep in range(num_frames):
        fig = plt.figure(figsize=(int(abs(x_min) + abs(x_max)) + 10, int(abs(y_min) + abs(y_max)) + 10))
        ax = fig.add_subplot(1, 1, 1)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        # setup the figure with car
        car_img = Image.open('resources/car-orange.png')
        faded_rate = np.linspace(0.2, 1, num=len(car_trajectory))
        pre_center = car_trajectory[0][0]
        for t in selected_traj_timesteps:  # only plot the car at the selected timesteps
            if t > frame_timestep:  # if the car has not appeared at this frame
                break
            car_center, rotation_degrees = car_trajectory[t]
            translation = transforms.Affine2D().translate(car_center[0], car_center[1])
            rotation = transforms.Affine2D().rotate_deg(rotation_degrees)
            rotation_translation = rotation + translation
            ax.imshow(car_img, extent=[-2.2, 2.2, -2, 2], transform=rotation_translation + ax.transData, alpha=faded_rate[t])
        for vec_tag, vec_all_frames in id_prev2curr_pred_vectors.items():
            vec_frame_info = id_prev2curr_pred_frame[vec_tag]
            first_appear_frame = sorted(list(vec_frame_info.keys()))[0]
            need_merge = False
            if frame_timestep < first_appear_frame:  # the instance has not appeared yet
                continue
            elif frame_timestep in vec_frame_info:
                need_merge = True
                vec_index_in_instance = vec_frame_info[frame_timestep]
            label, vec_glb_idx = vec_tag.split('_')
            label = int(label)
            vec_glb_idx = int(vec_glb_idx)
            if need_merge:
                curr_vec = 
vec_all_frames[vec_index_in_instance] curr_vec_polyline = LineString(curr_vec) if vec_tag not in instance_bank: # if the instance first appears polylines = [curr_vec_polyline,] else: # if the instance has appeared before, polylines = previous merged polyline + current polyline polylines = instance_bank[vec_tag] + [curr_vec_polyline,] else: # if the instance has not appeared in this frame polylines = instance_bank[vec_tag] if label == 0: # ped_crossing color = 'b' elif label == 1: # divider color = 'r' elif label == 2: # boundary color = 'g' if label == 0: # crossing, merged by convex hull if need_merge: polygon = merge_corssing(polylines) polygon = polygon.simplify(args.simplify) vector = np.array(polygon.exterior.coords) else: # if no new instance, use the previous merged polyline to plot vector = np.array(polylines[0].coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) # update instance bank for ped updated_polyline = LineString(vector) instance_bank[vec_tag] = [updated_polyline, ] elif label == 1: # divider, merged fitting a polyline if need_merge: polylines_vecs = [np.array(one_line.coords) for one_line in polylines] polylines_vecs = merge_divider(polylines_vecs) else: # if no new instance, use the previous merged polyline to plot polylines_vecs = [np.array(line.coords) for line in polylines] for one_line in polylines_vecs: one_line = np.array(LineString(one_line).simplify(args.simplify*2).coords) pts = one_line[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) # update instance bank for line updated_polylines = [LineString(vec) for vec in polylines_vecs] instance_bank[vec_tag] = updated_polylines elif label == 2: # boundary, do not merge if need_merge: polylines_vecs = [np.array(one_line.coords) for one_line in polylines] polylines_vecs = merge_boundary(polylines_vecs) else: # if no new instance, use the previous merged polyline to plot polylines_vecs = [np.array(line.coords) for line in polylines] for one_line in polylines_vecs: one_line = np.array(LineString(one_line).simplify(args.simplify).coords) pts = one_line[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) # update instance bank for line updated_polylines = [LineString(vec) for vec in polylines_vecs] instance_bank[vec_tag] = updated_polylines pred_save_path = pred_save_folder + f'/{frame_timestep}.png' plt.grid(False) plt.savefig(pred_save_path, bbox_inches='tight', transparent=args.transparent, dpi=args.dpi) plt.clf() plt.close(fig) print("image saved to : ", pred_save_path) image_list = [pred_save_folder + f'/{frame_timestep}.png' for frame_timestep in range(num_frames)] gif_output_path = pred_save_folder + '/vis.gif' save_as_video(image_list, gif_output_path) # merge the vectors across all frames and plot the merged vectors def plot_fig_merged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args): # setup the figure with car fig = plt.figure(figsize=(int(abs(x_min) + abs(x_max)) + 10 , int(abs(y_min) + abs(y_max)) + 10)) ax = fig.add_subplot(1, 1, 1) ax.set_xlim(x_min, x_max) 
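    # the axes span the union of the ROI and all transformed vectors (x/y limits are
    # computed by the caller), and the figure size above follows that extent plus a margin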
ax.set_ylim(y_min, y_max) car_img = Image.open('resources/car-orange.png') faded_rate = np.linspace(0.2, 1, num=len(car_trajectory)) # trace the path reversely, get the sub-sampled traj for visualizing the car pre_center = car_trajectory[-1][0] selected_traj = [] selected_timesteps = [] for timestep, (car_center, rotation_degrees) in enumerate(car_trajectory[::-1]): if np.linalg.norm(car_center - pre_center) < 5 and timestep > 0 and timestep < len(car_trajectory)-1: continue selected_traj.append([car_center, rotation_degrees]) selected_timesteps.append(len(car_trajectory)-1-timestep) pre_center = car_center selected_traj = selected_traj[::-1] selected_timesteps = selected_timesteps[::-1] for selected_t, (car_center, rotation_degrees) in zip(selected_timesteps, selected_traj): translation = transforms.Affine2D().translate(car_center[0], car_center[1]) rotation = transforms.Affine2D().rotate_deg(rotation_degrees) rotation_translation = rotation + translation ax.imshow(car_img, extent=[-2.2, 2.2, -2, 2], transform=rotation_translation+ ax.transData, alpha=faded_rate[selected_t]) # merge the vectors across all frames for tag, vecs in id_prev2curr_pred_vectors.items(): label, vec_glb_idx = tag.split('_') label = int(label) vec_glb_idx = int(vec_glb_idx) if label == 0: # ped_crossing color = 'b' elif label == 1: # divider color = 'r' elif label == 2: # boundary color = 'g' # get the vectors belongs to the same instance polylines = [] for vec in vecs: polylines.append(LineString(vec)) if len(polylines) <= 0: continue if label == 0: # crossing, merged by convex hull polygon = merge_corssing(polylines) if polygon.area < 2: continue polygon = polygon.simplify(args.simplify) vector = np.array(polygon.exterior.coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) elif label == 1: # divider, merged by interpolation polylines_vecs = [np.array(one_line.coords) for one_line in polylines] polylines_vecs = merge_divider(polylines_vecs) for one_line in polylines_vecs: one_line = np.array(LineString(one_line).simplify(args.simplify).coords) pts = one_line[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) elif label == 2: # boundary, merged by interpolation polylines_vecs = [np.array(one_line.coords) for one_line in polylines] polylines_vecs = merge_boundary(polylines_vecs) for one_line in polylines_vecs: one_line = np.array(LineString(one_line).simplify(args.simplify).coords) pts = one_line[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) plt.grid(False) plt.savefig(pred_save_path, bbox_inches='tight', transparent=args.transparent, dpi=args.dpi) plt.clf() plt.close(fig) print("image saved to : ", pred_save_path) def plot_fig_unmerged_per_frame(num_frames, car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args): os.makedirs(pred_save_folder, exist_ok=True) # trace the path reversely, get the sub-sampled traj for visualizing the car pre_center = car_trajectory[-1][0] selected_traj_timesteps = [] for timestep, (car_center, rotation_degrees) in 
enumerate(car_trajectory[::-1]): if np.linalg.norm(car_center - pre_center) < 5 and timestep > 0 and timestep < len(car_trajectory)-1: continue selected_traj_timesteps.append(len(car_trajectory)-1-timestep) pre_center = car_center selected_traj_timesteps = selected_traj_timesteps[::-1] # setup the figure with car fig = plt.figure(figsize=(int(abs(x_min) + abs(x_max)) + 10 , int(abs(y_min) + abs(y_max)) + 10)) ax = fig.add_subplot(1, 1, 1) ax.set_xlim(x_min, x_max) ax.set_ylim(y_min, y_max) car_img = Image.open('resources/car-orange.png') for frame_timestep in range(num_frames): faded_rate = np.linspace(0.2, 1, num=len(car_trajectory)) if frame_timestep in selected_traj_timesteps: car_center, rotation_degrees = car_trajectory[frame_timestep] translation = transforms.Affine2D().translate(car_center[0], car_center[1]) rotation = transforms.Affine2D().rotate_deg(rotation_degrees) rotation_translation = rotation + translation ax.imshow(car_img, extent=[-2.2, 2.2, -2, 2], transform=rotation_translation+ ax.transData, alpha=faded_rate[frame_timestep]) # plot the vectors for vec_tag, vec_all_frames in id_prev2curr_pred_vectors.items(): vec_frame_info = id_prev2curr_pred_frame[vec_tag] if frame_timestep not in vec_frame_info: # the instance has not appeared continue else: vec_index_in_instance = vec_frame_info[frame_timestep] curr_vec = vec_all_frames[vec_index_in_instance] label, vec_glb_idx = vec_tag.split('_') label = int(label) vec_glb_idx = int(vec_glb_idx) if label == 0: # ped_crossing color = 'b' elif label == 1: # divider color = 'r' elif label == 2: # boundary color = 'g' polyline = LineString(curr_vec) vector = np.array(polyline.coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, 'o-', color=color, linewidth=20, markersize=50) pred_save_path = pred_save_folder + f'/{frame_timestep}.png' plt.savefig(pred_save_path, bbox_inches='tight', transparent=args.transparent, dpi=args.dpi) print("image saved to : ", pred_save_path) plt.grid(False) plt.clf() plt.close(fig) image_list = [pred_save_folder + f'/{frame_timestep}.png' for frame_timestep in range(num_frames)] gif_output_path = pred_save_folder + '/vis.gif' save_as_video(image_list, gif_output_path) def plot_fig_unmerged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args): # setup the figure with car fig = plt.figure(figsize=(int(abs(x_min) + abs(x_max)) + 10 , int(abs(y_min) + abs(y_max)) + 10)) ax = fig.add_subplot(1, 1, 1) ax.set_xlim(x_min, x_max) ax.set_ylim(y_min, y_max) car_img = Image.open('resources/car-orange.png') # trace the path reversely, get the sub-sampled traj for visualizing the car pre_center = car_trajectory[-1][0] selected_traj = [] selected_timesteps = [] for timestep, (car_center, rotation_degrees) in enumerate(car_trajectory[::-1]): if np.linalg.norm(car_center - pre_center) < 5 and timestep > 0 and timestep < len(car_trajectory)-1: continue selected_traj.append([car_center, rotation_degrees]) selected_timesteps.append(len(car_trajectory)-1-timestep) pre_center = car_center selected_traj = selected_traj[::-1] selected_timesteps = selected_timesteps[::-1] # plot the car trajectory with faded_rate faded_rate = np.linspace(0.2, 1, num=len(car_trajectory)) for selected_t, (car_center, rotation_degrees) in zip(selected_timesteps, selected_traj): translation = transforms.Affine2D().translate(car_center[0], car_center[1]) rotation = transforms.Affine2D().rotate_deg(rotation_degrees) rotation_translation = rotation + 
translation ax.imshow(car_img, extent=[-2.2, 2.2, -2, 2], transform=rotation_translation+ ax.transData, alpha=faded_rate[selected_t]) # plot the unmerged vectors (all the predicted/ gt vectors) for tag, vecs in id_prev2curr_pred_vectors.items(): label, vec_glb_idx = tag.split('_') label = int(label) vec_glb_idx = int(vec_glb_idx) if label == 0: # ped_crossing color = 'b' elif label == 1: # divider color = 'r' elif label == 2: # boundary color = 'g' polylines = [] for vec in vecs: polylines.append(LineString(vec)) if len(polylines) <= 0: continue for one_line in polylines: vector = np.array(one_line.coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, 'o-', color=color, linewidth=20, markersize=50) plt.savefig(pred_save_path, bbox_inches='tight', transparent=args.transparent, dpi=args.dpi) plt.clf() plt.close(fig) print("image saved to : ", pred_save_path) # the timestep text visualization def save_t(t_max, main_save_folder): txt_save_folder = os.path.join(main_save_folder, 'txt') os.makedirs(txt_save_folder, exist_ok=True) t = range(t_max) for i in t: fig, ax = plt.subplots(figsize=(2, 1), dpi=300) # Increase DPI for higher resolution ax.text(0.1, 0.5, f't = {i}', fontsize=40,ha='left', va='center') ax.axis('off') ax.set_xlim(0, 1) ax.set_ylim(0, 1) fig.subplots_adjust(left=0, right=1, top=1, bottom=0) # Remove margins around the text plt.savefig(f'{txt_save_folder}/text_{i}.png',pad_inches=0) plt.close(fig) text_images = [f'{txt_save_folder}/text_{i}.png' for i in t] frames = [imageio.imread(img_path) for img_path in text_images] mp4_output_path = os.path.join(main_save_folder, 'text.mp4') imageio.mimsave(mp4_output_path, frames, fps=10) # fps controls the speed of the video print("mp4 saved to : ", mp4_output_path) def save_as_video(image_list, mp4_output_path, scale=None): mp4_output_path = mp4_output_path.replace('.gif','.mp4') images = [Image.fromarray(imageio.imread(img_path)).convert("RGBA") for img_path in image_list] if scale is not None: w, h = images[0].size images = [img.resize((int(w*scale), int(h*scale)), Image.Resampling.LANCZOS) for img in images] # images = [Image.new('RGBA', images[0].size, (255, 255, 255, 255))] + images try: imageio.mimsave(mp4_output_path, images, format='MP4',fps=10) except ValueError: # in case the shapes are not the same, have to manually adjust resized_images = [img.resize(images[0].size, Image.Resampling.LANCZOS) for img in images] print('Size not all the same, manually adjust...') imageio.mimsave(mp4_output_path, resized_images, format='MP4',fps=10) print("mp4 saved to : ", mp4_output_path) def vis_pred_data(scene_name="", pred_results=None, origin=None, roi_size=None, args=None): # get the item index of the scene index_list = [] for index in range(len(pred_results)): if pred_results[index]["scene_name"] == scene_name: index_list.append(index) car_trajectory = [] id_prev2curr_pred_vectors = defaultdict(list) id_prev2curr_pred_frame_info = defaultdict(list) id_prev2curr_pred_frame = defaultdict(list) # iterate through each frame last_index = index_list[-1] for index in index_list: vectors = np.array(pred_results[index]["vectors"]).reshape((len(np.array(pred_results[index]["vectors"])), 20, 2)) if abs(vectors.max()) <= 1: curr_vectors = vectors * roi_size + origin else: curr_vectors = vectors # get the transformation matrix of the last frame prev_e2g_trans = torch.tensor(pred_results[index]['meta']['ego2global_translation'], dtype=torch.float64) prev_e2g_rot = 
torch.tensor(pred_results[index]['meta']['ego2global_rotation'], dtype=torch.float64) curr_e2g_trans = torch.tensor(pred_results[last_index]['meta']['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = torch.tensor(pred_results[last_index]['meta']['ego2global_rotation'], dtype=torch.float64) prev_e2g_matrix = torch.eye(4, dtype=torch.float64) prev_e2g_matrix[:3, :3] = prev_e2g_rot prev_e2g_matrix[:3, 3] = prev_e2g_trans curr_g2e_matrix = torch.eye(4, dtype=torch.float64) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) prev2curr_matrix = curr_g2e_matrix @ prev_e2g_matrix prev2curr_pred_vectors = get_prev2curr_vectors(curr_vectors, prev2curr_matrix,origin,roi_size,False,False) prev2curr_pred_vectors = prev2curr_pred_vectors * roi_size + origin rotation_degrees = np.degrees(np.arctan2(prev2curr_matrix[:3, :3][1, 0], prev2curr_matrix[:3, :3][0, 0])) car_center = get_prev2curr_vectors(np.array((0,0)).reshape(1,1,2), prev2curr_matrix,origin,roi_size,False,False)* roi_size + origin car_trajectory.append([car_center.squeeze(), rotation_degrees]) for i, (label, vec_glb_idx) in enumerate(zip(pred_results[index]['labels'], pred_results[index]['global_ids'])): dict_key = "{}_{}".format(label, vec_glb_idx) id_prev2curr_pred_vectors[dict_key].append(prev2curr_pred_vectors[i]) id_prev2curr_pred_frame_info[dict_key].append([pred_results[index]["local_idx"], len(id_prev2curr_pred_frame[dict_key])]) for key, frame_info in id_prev2curr_pred_frame_info.items(): frame_localIdx = dict() for frame_time, local_index in frame_info: frame_localIdx[frame_time] = local_index id_prev2curr_pred_frame[key] = frame_localIdx # sort the id_prev2curr_pred_vectors id_prev2curr_pred_vectors = {key: id_prev2curr_pred_vectors[key] for key in sorted(id_prev2curr_pred_vectors)} # set the size of the image x_min = -roi_size[0] / 2 x_max = roi_size[0] / 2 y_min = -roi_size[1] / 2 y_max = roi_size[1] / 2 all_points = [] for vecs in id_prev2curr_pred_vectors.values(): points = np.concatenate(vecs, axis=0) all_points.append(points) all_points = np.concatenate(all_points, axis=0) x_min = min(x_min, all_points[:,0].min()) x_max = max(x_max, all_points[:,0].max()) y_min = min(y_min, all_points[:,1].min()) y_max = max(y_max, all_points[:,1].max()) scene_dir = os.path.join(args.out_dir, scene_name) os.makedirs(scene_dir,exist_ok=True) if args.per_frame_result: num_frames = len(index_list) pred_save_folder = os.path.join(scene_dir, f'pred_merged_per_frame') plot_fig_merged_per_frame(num_frames, car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args) pred_save_folder = os.path.join(scene_dir, f'pred_unmerged_per_frame') plot_fig_unmerged_per_frame(num_frames, car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args) pred_save_path = os.path.join(scene_dir, f'pred_unmerged.png') plot_fig_unmerged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args) pred_save_path = os.path.join(scene_dir, f'pred_merged.png') plot_fig_merged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args) comb_save_path = os.path.join(scene_dir, f'pred_comb.png') image_paths = [os.path.join(scene_dir, f'pred_merged.png'), os.path.join(scene_dir, f'pred_unmerged.png')] labels = ['Merged', 'Unmerged'] combine_images_with_labels(image_paths, labels, comb_save_path) print("image saved to : ", comb_save_path) def 
vis_gt_data(scene_name, args, dataset, gt_data, origin, roi_size): gt_info = gt_data[scene_name] gt_info_list = [] ids_info = [] # get the item index of the sample for index, one_idx in enumerate(gt_info["sample_ids"]): gt_info_list.append(dataset[one_idx]) ids_info.append(gt_info["instance_ids"][index]) car_trajectory = [] scene_dir = os.path.join(args.out_dir,scene_name) os.makedirs(scene_dir,exist_ok=True) # key : label, vec_glb_idx ; value : list of vectors in the last frame's coordinate id_prev2curr_pred_vectors = defaultdict(list) # dict to store some information of the vectors id_prev2curr_pred_frame_info = defaultdict(list) # key : label, vec_glb_idx ; value : {frame_time : idx of the vector; idx range from 0 to the number of vectors of the same instance } id_prev2curr_pred_frame = defaultdict(dict) scene_len = len(gt_info_list) for idx in range(scene_len): curr_vectors = dict() # denormalize the vectors for label, vecs in gt_info_list[idx]['vectors'].data.items(): if len(vecs) > 0: # if vecs != [] curr_vectors[label] = vecs * roi_size + origin else: curr_vectors[label] = vecs # get the transformation matrix of the last frame prev_e2g_trans = torch.tensor(gt_info_list[idx]['img_metas'].data['ego2global_translation'], dtype=torch.float64) prev_e2g_rot = torch.tensor(gt_info_list[idx]['img_metas'].data['ego2global_rotation'], dtype=torch.float64) curr_e2g_trans = torch.tensor(gt_info_list[scene_len-1]['img_metas'].data['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = torch.tensor(gt_info_list[scene_len-1]['img_metas'].data['ego2global_rotation'], dtype=torch.float64) prev_e2g_matrix = torch.eye(4, dtype=torch.float64) prev_e2g_matrix[:3, :3] = prev_e2g_rot prev_e2g_matrix[:3, 3] = prev_e2g_trans curr_g2e_matrix = torch.eye(4, dtype=torch.float64) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) # get the transformed vectors from current frame to the last frame prev2curr_matrix = curr_g2e_matrix @ prev_e2g_matrix prev2curr_pred_vectors = get_consecutive_vectors_with_opt(curr_vectors,prev2curr_matrix,origin,roi_size,False,False) for label, id_info in ids_info[idx].items(): for vec_local_idx, vec_glb_idx in id_info.items(): dict_key = "{}_{}".format(label, vec_glb_idx) id_prev2curr_pred_vectors[dict_key].append(prev2curr_pred_vectors[label][vec_local_idx]) # gt_info_list[idx]["seq_info"].data[1] stores the frame time that the vector appears id_prev2curr_pred_frame_info[dict_key].append([gt_info_list[idx]["seq_info"].data[1], len(id_prev2curr_pred_frame[dict_key])]) # set len(id_prev2curr_pred_frame[dict_key]) to be the index of the vector belongs to the same instance for key, frame_info in id_prev2curr_pred_frame_info.items(): frame_localIdx = dict() for frame_time, local_index in frame_info: frame_localIdx[frame_time] = local_index id_prev2curr_pred_frame[key] = frame_localIdx rotation_degrees = np.degrees(np.arctan2(prev2curr_matrix[:3, :3][1, 0], prev2curr_matrix[:3, :3][0, 0])) # get the center of the car in the last frame's coordinate car_center = get_prev2curr_vectors(np.array((0,0)).reshape(1,1,2), prev2curr_matrix,origin,roi_size,False,False)* roi_size + origin car_trajectory.append([car_center.squeeze(), rotation_degrees]) # sort the id_prev2curr_pred_vectors by label and vec_glb_idx id_prev2curr_pred_vectors = {key: id_prev2curr_pred_vectors[key] for key in sorted(id_prev2curr_pred_vectors)} # get the x_min, x_max, y_min, y_max for the figure size x_min = -roi_size[0] / 2 x_max = roi_size[0] / 2 y_min = 
-roi_size[1] / 2 y_max = roi_size[1] / 2 all_points = [] for vecs in id_prev2curr_pred_vectors.values(): points = np.concatenate(vecs, axis=0) all_points.append(points) all_points = np.concatenate(all_points, axis=0) x_min = min(x_min, all_points[:,0].min()) x_max = max(x_max, all_points[:,0].max()) y_min = min(y_min, all_points[:,1].min()) y_max = max(y_max, all_points[:,1].max()) scene_dir = os.path.join(args.out_dir,scene_name) os.makedirs(scene_dir,exist_ok=True) # if visulize the per frame result if args.per_frame_result: pred_save_folder = os.path.join(scene_dir, f'gt_merged_per_frame') plot_fig_merged_per_frame(len(gt_info_list), car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args) pred_save_folder = os.path.join(scene_dir, f'gt_unmerged_per_frame') plot_fig_unmerged_per_frame(len(gt_info_list), car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args) # plot result for across all frames pred_save_path = os.path.join(scene_dir, f'gt_unmerged.png') plot_fig_unmerged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args) pred_save_path = os.path.join(scene_dir, f'gt_merged.png') plot_fig_merged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args) # combine the merged and unmerged images into one plot for comparison comb_save_path = os.path.join(scene_dir, f'gt_comb.png') image_paths = [os.path.join(scene_dir, f'gt_merged.png'), os.path.join(scene_dir, f'gt_unmerged.png')] labels = ['Merged', 'Unmerged'] combine_images_with_labels(image_paths, labels, comb_save_path) print("image saved to : ", comb_save_path) def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) dataset = build_dataset(cfg.match_config) scene_name2idx = {} scene_name2token = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2token[scene] = [] scene_name2idx[scene].append(idx) # load the GT data if args.option == "vis-gt": data = mmcv.load(args.data_path) # load the prediction data elif args.option == "vis-pred": with open(args.data_path,'rb') as fp: data = pickle.load(fp) all_scene_names = sorted(list(scene_name2idx.keys())) roi_size = torch.tensor(cfg.roi_size).numpy() origin = torch.tensor(cfg.pc_range[:2]).numpy() for scene_name in all_scene_names: if args.scene_id is not None and scene_name not in args.scene_id: continue scene_dir = os.path.join(args.out_dir,scene_name) if os.path.exists(scene_dir) and len(os.listdir(scene_dir)) > 0 and not args.overwrite: print(f"Scene {scene_name} already generated, skipping...") continue os.makedirs(scene_dir,exist_ok=True) if args.option == "vis-gt": # visualize the GT data vis_gt_data(scene_name=scene_name, args=args, dataset=dataset, gt_data=data, origin=origin, roi_size=roi_size) elif args.option == "vis-pred": # visualize the prediction results vis_pred_data(scene_name=scene_name, pred_results=data, origin=origin, roi_size=roi_size, args=args) else: raise ValueError('Invalid visualization option {}'.format(args.option)) if __name__ == '__main__': main() ================================================ FILE: tools/visualization/vis_per_frame.py ================================================ import sys import os SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.dirname(SCRIPT_DIR)) import argparse import mmcv from 
mmcv import Config
from mmdet3d.datasets import build_dataset
import torch
import numpy as np
from PIL import Image
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
import imageio

from tracking.cmap_utils.match_utils import *


def parse_args():
    parser = argparse.ArgumentParser(
        description='Visualize groundtruth and results')
    parser.add_argument('config', help='config file path')
    parser.add_argument(
        '--out_dir',
        required=True,
        default='demo',
        help='directory where visualization results will be saved')
    parser.add_argument(
        '--data_path',
        required=True,
        default="",
        help='path to the submission file')
    parser.add_argument(
        '--scene_id',
        type=str,
        nargs='+',
        default=None,
        help='scene id(s) to visualize')
    parser.add_argument(
        '--option',
        default="vis-gt",
        help='vis-gt or vis-pred')
    parser.add_argument(
        '--line_opacity',
        default=0.75,
        type=float,
        help='opacity of the plotted lines')
    parser.add_argument(
        '--overwrite',
        default=1,
        type=int,
        help='whether to overwrite the existing images')
    parser.add_argument(
        '--dpi',
        default=20,
        type=int,
        help='DPI of the saved figures')
    args = parser.parse_args()
    return args


def save_as_video(image_list, mp4_output_path, scale=None):
    mp4_output_path = mp4_output_path.replace('.gif', '.mp4')
    images = [Image.fromarray(img).convert("RGBA") for img in image_list]
    if scale is not None:
        w, h = images[0].size
        images = [img.resize((int(w*scale), int(h*scale)), Image.Resampling.LANCZOS) for img in images]
    images = [Image.new('RGBA', images[0].size, (255, 255, 255, 255))] + images
    try:
        imageio.mimsave(mp4_output_path, images, format='MP4', fps=10)
    except ValueError:
        # in case the shapes are not all the same, have to manually adjust
        resized_images = [img.resize(images[0].size, Image.Resampling.LANCZOS) for img in images]
        print('Size not all the same, manually adjust...')
        imageio.mimsave(mp4_output_path, resized_images, format='MP4', fps=10)
    print("mp4 saved to : ", mp4_output_path)


def plot_one_frame_results(vectors, id_info, roi_size, scene_dir, args):
    # setup the figure with car
    plt.figure(figsize=(roi_size[0], roi_size[1]))
    plt.xlim(-roi_size[0] / 2, roi_size[0] / 2)
    plt.ylim(-roi_size[1] / 2, roi_size[1] / 2)
    plt.axis('off')
    plt.autoscale(False)
    car_img = Image.open('resources/car-orange.png')
    plt.imshow(car_img, extent=[-2.2, 2.2, -2, 2])
    for label, vecs in vectors.items():
        if label == 0:  # ped_crossing
            color = 'b'
            label_text = 'P'
        elif label == 1:  # divider
            color = 'r'
            label_text = 'D'
        elif label == 2:  # boundary
            color = 'g'
            label_text = 'B'
        if len(vecs) == 0:
            continue
        for vec_idx, vec in enumerate(vecs):
            pts = vec[:, :2]
            x = np.array([pt[0] for pt in pts])
            y = np.array([pt[1] for pt in pts])
            plt.plot(x, y, 'o-', color=color, linewidth=25, markersize=20, alpha=args.line_opacity)
            vec_id = id_info[label][vec_idx]
            mid_idx = len(x) // 2
            # Put instance id; prevent the text from changing the fig size...
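            # If the polyline midpoint lies within 2 units of the top/bottom ROI border
            # or 4 units of the left/right border, nudge the text position inward so the
            # instance label stays inside the fixed axes.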
if -roi_size[1]/2 <= y[mid_idx] < -roi_size[1]/2 + 2: text_y = y[mid_idx] + 2 elif roi_size[1]/2 - 2 < y[mid_idx] <= roi_size[1]/2: text_y = y[mid_idx] - 2 else: text_y = y[mid_idx] if -roi_size[0]/2 <= x[mid_idx] < -roi_size[0]/2 + 4: text_x = x[mid_idx] + 4 elif roi_size[0]/2 - 4 < x[mid_idx] <= roi_size[0]/2: text_x = x[mid_idx] - 4 else: text_x = x[mid_idx] plt.text(text_x, text_y, f'{label_text}{vec_id}', fontsize=80, color=color) save_path = os.path.join(scene_dir, 'temp.png') plt.savefig(save_path, bbox_inches='tight', transparent=False, dpi=args.dpi) plt.clf() plt.close() viz_image = imageio.imread(save_path) return viz_image def vis_pred_data(scene_name, args, pred_results, origin,roi_size): # get the item index of the scene scene_idx = defaultdict(list) for index in range(len(pred_results)): scene_idx[pred_results[index]["scene_name"]].append(index) index_list = scene_idx[scene_name] scene_dir = os.path.join(args.out_dir,scene_name) os.makedirs(scene_dir,exist_ok=True) g2l_id_mapping = dict() label_ins_counter = {0:0, 1:0, 2:0} all_viz_images = [] # iterate through each frame of the pred sequence for index in index_list: vectors = np.array(pred_results[index]["vectors"]).reshape((len(np.array(pred_results[index]["vectors"])), 20, 2)) # some results are normalized, some not... if np.abs(vectors).max() <= 1: vectors = vectors * roi_size + origin labels = np.array(pred_results[index]["labels"]) global_ids = np.array(pred_results[index]["global_ids"]) per_label_results = defaultdict(list) for ins_idx in range(len(vectors)): label = int(labels[ins_idx]) global_id = int(global_ids[ins_idx]) if global_id not in g2l_id_mapping: local_idx = label_ins_counter[label] g2l_id_mapping[global_id] = (label, local_idx) label_ins_counter[label] += 1 else: if label == g2l_id_mapping[global_id][0]: local_idx = g2l_id_mapping[global_id][1] else: # label changes for a tracked instance (can happen in our method) # need to update the global id info local_idx = label_ins_counter[label] g2l_id_mapping[global_id] = (label, local_idx) label_ins_counter[label] += 1 per_label_results[label].append([vectors[ins_idx], global_id, local_idx]) curr_vectors = defaultdict(list) id_info = dict() for label, results in per_label_results.items(): vec_results = [item[0] for item in results] global_ids = [item[1] for item in results] local_ids = [item[2] for item in results] curr_vectors[label] = np.stack(vec_results, axis=0) id_info[label] = {idx:ins_id for idx, ins_id in enumerate(local_ids)} viz_image = plot_one_frame_results(curr_vectors, id_info, roi_size, scene_dir, args) all_viz_images.append(viz_image) gif_path = os.path.join(scene_dir, 'per_frame_pred.gif') save_as_video(all_viz_images, gif_path) def vis_gt_data(scene_name, args, dataset, scene_name2idx, gt_data, origin, roi_size): gt_info = gt_data[scene_name] gt_info_list = [] ids_info = [] scene_dir = os.path.join(args.out_dir,scene_name) os.makedirs(scene_dir,exist_ok=True) for index, one_idx in enumerate(gt_info["sample_ids"]): gt_info_list.append(dataset[one_idx]) ids_info.append(gt_info["instance_ids"][index]) scene_len = len(gt_info_list) all_viz_images = [] all_cam_images = {cam_name: [] for cam_name in dataset.samples[0]['cams'].keys()} for frame_idx in range(scene_len): global_idx = scene_name2idx[scene_name][frame_idx] # collect images for each camera cams = dataset.samples[global_idx]['cams'] for cam, info in cams.items(): img = imageio.imread(info['img_fpath']) all_cam_images[cam].append(img) # collect vectors for each frame curr_vectors = dict() 
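        # denormalize the GT vectors of this frame from the normalized [0, 1] range back
        # to metric ego coordinates (roi_size and origin come from the config) before plotting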
for label, vecs in gt_info_list[frame_idx]['vectors'].data.items(): if len(vecs) > 0: curr_vectors[label] = vecs * roi_size + origin else: curr_vectors[label] = vecs id_info = ids_info[frame_idx] viz_image = plot_one_frame_results(curr_vectors, id_info, roi_size, scene_dir, args) all_viz_images.append(viz_image) gif_path = os.path.join(scene_dir, 'per_frame_gt.gif') save_as_video(all_viz_images, gif_path) for cam_name, image_list in all_cam_images.items(): gif_path = os.path.join(scene_dir, f'{cam_name}.gif') save_as_video(image_list, gif_path, scale=0.3) def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) dataset = build_dataset(cfg.match_config) scene_name2idx = {} scene_name2token = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2token[scene] = [] scene_name2idx[scene].append(idx) if args.data_path == "": data = {} elif args.option == "vis-gt": # visulize GT option data = mmcv.load(args.data_path) elif args.option == "vis-pred": with open(args.data_path,'rb') as fp: data = pickle.load(fp) all_scene_names = sorted(list(scene_name2idx.keys())) scene_info_list = [] for single_scene_name in all_scene_names: scene_info_list.append((single_scene_name, args)) roi_size = torch.tensor(cfg.roi_size).numpy() origin = torch.tensor(cfg.pc_range[:2]).numpy() for scene_name in all_scene_names: if args.scene_id is not None and scene_name not in args.scene_id: continue scene_dir = os.path.join(args.out_dir,scene_name) if os.path.exists(scene_dir) and len(os.listdir(scene_dir)) > 0 and not args.overwrite: print(f"Scene {scene_name} already generated, skipping...") continue os.makedirs(scene_dir,exist_ok=True) if args.option == "vis-gt": vis_gt_data(scene_name=scene_name, args=args, dataset=dataset, scene_name2idx=scene_name2idx, gt_data=data,origin=origin,roi_size=roi_size) elif args.option == "vis-pred": vis_pred_data(scene_name=scene_name, args=args, pred_results=data, origin=origin, roi_size=roi_size) if __name__ == '__main__': main()
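
# Example invocation (a sketch for reference, not part of the original script; the config
# path, data path and output directory below are placeholders; substitute your own files):
#
#   python tools/visualization/vis_per_frame.py path/to/maptracker_config.py \
#       --out_dir demo/per_frame_vis \
#       --data_path path/to/pred_results.pkl \
#       --option vis-pred \
#       --scene_id <scene_name>
#
# With `--option vis-gt`, `--data_path` should instead point to the ground-truth pickle
# whose per-scene entries provide "sample_ids" and "instance_ids" (see vis_gt_data above).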