Repository: Zzh-tju/CIoU
Branch: master
Commit: a9f589f28053
Files: 96
Total size: 19.5 MB
Directory structure:
gitextract_fc4uicbd/
├── .gitignore
├── LICENSE
├── README.md
├── README_zh-CN.md
├── backbone.py
├── data/
│ ├── __init__.py
│ ├── coco.py
│ ├── config.py
│ ├── grid.npy
│ └── scripts/
│ ├── COCO.sh
│ ├── COCO_test.sh
│ └── mix_sets.py
├── environment.yml
├── eval.py
├── external/
│ └── DCNv2/
│ ├── LICENSE
│ ├── README.md
│ ├── dcn_v2.py
│ ├── setup.py
│ ├── src/
│ │ ├── cpu/
│ │ │ ├── dcn_v2_cpu.cpp
│ │ │ └── vision.h
│ │ ├── cuda/
│ │ │ ├── dcn_v2_cuda.cu
│ │ │ ├── dcn_v2_im2col_cuda.cu
│ │ │ ├── dcn_v2_im2col_cuda.h
│ │ │ ├── dcn_v2_psroi_pooling_cuda.cu
│ │ │ └── vision.h
│ │ ├── dcn_v2.h
│ │ └── vision.cpp
│ └── test.py
├── layers/
│ ├── __init__.py
│ ├── box_utils.py
│ ├── functions/
│ │ ├── __init__.py
│ │ └── detection.py
│ ├── interpolate.py
│ ├── modules/
│ │ ├── __init__.py
│ │ └── multibox_loss.py
│ └── output_utils.py
├── run_coco_eval.py
├── scripts/
│ ├── augment_bbox.py
│ ├── bbox_recall.py
│ ├── cluster_bbox_sizes.py
│ ├── compute_masks.py
│ ├── convert_darknet.py
│ ├── convert_sbd.py
│ ├── eval.sh
│ ├── make_grid.py
│ ├── optimize_bboxes.py
│ ├── parse_eval.py
│ ├── plot_loss.py
│ ├── resume.sh
│ ├── save_bboxes.py
│ ├── train.sh
│ └── unpack_statedict.py
├── train.py
├── utils/
│ ├── __init__.py
│ ├── augmentations.py
│ ├── cython_nms.pyx
│ ├── functions.py
│ ├── logger.py
│ ├── nvinfo.py
│ └── timer.py
├── web/
│ ├── css/
│ │ ├── index.css
│ │ ├── list.css
│ │ ├── toggle.css
│ │ └── viewer.css
│ ├── dets/
│ │ ├── ssd300.json
│ │ ├── ssd550.json
│ │ ├── ssd550_resnet101.json
│ │ ├── test.json
│ │ ├── yolact_base.json
│ │ ├── yolact_darknet53.json
│ │ ├── yolact_im700.json
│ │ ├── yolact_resnet101_conv4.json
│ │ ├── yolact_resnet101_maskrcnn.json
│ │ ├── yolact_resnet101_maskrcnn_1.json
│ │ ├── yolact_resnet50.json
│ │ ├── yrm12.json
│ │ ├── yrm13.json
│ │ ├── yrm16_2.json
│ │ ├── yrm18.json
│ │ ├── yrm19.json
│ │ ├── yrm21.json
│ │ ├── yrm25_b.json
│ │ ├── yrm28_2_perfect.json
│ │ ├── yrm35_crop.json
│ │ └── yrm35_retina.json
│ ├── index.html
│ ├── iou.html
│ ├── scripts/
│ │ ├── index.js
│ │ ├── iou.js
│ │ ├── jquery.js
│ │ ├── js.cookie.js
│ │ ├── utils.js
│ │ └── viewer.js
│ ├── server.py
│ └── viewer.html
└── yolact.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
================================================
FILE: LICENSE
================================================
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.
================================================
FILE: README.md
================================================
### English | [简体中文](README_zh-CN.md)
## Complete-IoU Loss and Cluster-NMS for Improving Object Detection and Instance Segmentation.
Our paper is accepted by **IEEE Transactions on Cybernetics (TCYB)**.
### This repo is based on YOLACT++.
This is the code for our papers:
- [Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression](https://arxiv.org/abs/1911.08287)
- [Enhancing Geometric Factors into Model Learning and Inference for Object Detection and Instance Segmentation](https://arxiv.org/abs/2005.03572)
```
@Inproceedings{zheng2020diou,
author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei},
title = {Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression},
booktitle = {The AAAI Conference on Artificial Intelligence (AAAI)},
pages = {12993--13000},
year = {2020}
}
@Article{zheng2021ciou,
author = {Zheng, Zhaohui and Wang, Ping and Ren, Dongwei and Liu, Wei and Ye, Rongguang and Hu, Qinghua and Zuo, Wangmeng},
title = {Enhancing Geometric Factors in Model Learning and Inference for Object Detection and Instance Segmentation},
journal = {IEEE Transactions on cybernetics},
volume = {52},
number = {8},
pages = {8574--8586},
year = {2021},
publisher = {IEEE}
}
```
## Description of Cluster-NMS and Its Usage
An example diagram of our Cluster-NMS, where X denotes IoU matrix which is calculated by `X=jaccard(boxes,boxes).triu_(diagonal=1) > nms_thresh` after sorted by score descending. (Here use 0,1 for visualization.)
The inputs of NMS are `boxes` with size [n,4] and `scores` with size [80,n]. (take coco as example)
There are two ways for NMS. One is that all classes have the same number of boxes. First, we use top k=200 to select the top 200 detections for every class. Then `boxes` will be [80,200,4]. Do Cluster-NMS and keep the boxes with `scores>0.01`. Finally, return top 100 boxes across all classes.
The other approach is that different classes have different numbers of boxes. First, we use a score threshold (e.g. 0.01) to filter out most low score detection boxes. It results in the number of remaining boxes in different classes may be different. Then put all the boxes together and sorted by score descending. (Note that the same box may appear more than once, because its scores of multiple classes are greater than the threshold 0.01.) Adding offset for all the `boxes` according to their class labels. (use `torch.arange(0,80)`.) For example, since the coordinates (x1,y1,x2,y2) of all the boxes are on interval (0,1). By adding offset, if a box belongs to class 61, its coordinates will on interval (60,61). After that, the IoU of boxes belonging to different classes will be 0. (because they are treated as different clusters.) Do Cluster-NMS and return top 100 boxes across all classes. (For this method, please refer to another our repository https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/detection/detection.py)
## Getting Started
### 1) New released! CIoU and Cluster-NMS
1. YOLACT (See [YOLACT](https://github.com/Zzh-tju/CIoU#YOLACT))
2. YOLOv3-pytorch [https://github.com/Zzh-tju/ultralytics-YOLOv3-Cluster-NMS](https://github.com/Zzh-tju/ultralytics-YOLOv3-Cluster-NMS)
3. YOLOv5 (Support batch mode Cluster-NMS. It will speed up NMS when turning on test-time augmentation like multi-scale testing.) [https://github.com/Zzh-tju/yolov5](https://github.com/Zzh-tju/yolov5)
4. SSD-pytorch [https://github.com/Zzh-tju/DIoU-SSD-pytorch](https://github.com/Zzh-tju/DIoU-SSD-pytorch)
### 2) DIoU and CIoU losses into Detection Algorithms
DIoU and CIoU losses are incorporated into state-of-the-art detection algorithms, including YOLO v3, SSD and Faster R-CNN.
The details of implementation and comparison can be respectively found in the following links.
1. YOLO v3 [https://github.com/Zzh-tju/DIoU-darknet](https://github.com/Zzh-tju/DIoU-darknet)
2. SSD [https://github.com/Zzh-tju/DIoU-SSD-pytorch](https://github.com/Zzh-tju/DIoU-SSD-pytorch)
3. Faster R-CNN [https://github.com/Zzh-tju/DIoU-pytorch-detectron](https://github.com/Zzh-tju/DIoU-pytorch-detectron)
4. Simulation Experiment [https://github.com/Zzh-tju/DIoU](https://github.com/Zzh-tju/DIoU)
# YOLACT
### Codes location and options
Please take a look at `ciou` function of [layers/modules/multibox_loss.py](layers/modules/multibox_loss.py) for our CIoU loss implementation in PyTorch.
Currently, NMS supports two modes: (See [eval.py](eval.py))
1. Cross-class mode, which ignores classes. (`cross_class_nms=True`, faster than per-class mode but with a slight performance drop.)
2. Per-class mode. (`cross_class_nms=False`)
Currently, NMS supports `fast_nms`, `cluster_nms`, `cluster_diounms`, `spm`, `spm_dist`, `spm_dist_weighted`.
See [layers/functions/detection.py](layers/functions/detection.py) for our Cluster-NMS implementation in PyTorch.
# Installation
In order to use YOLACT++, make sure you compile the DCNv2 code.
- Clone this repository and enter it:
```Shell
git clone https://github.com/Zzh-tju/CIoU.git
cd CIoU
```
- Set up the environment using one of the following methods:
- Using [Anaconda](https://www.anaconda.com/distribution/)
- Run `conda env create -f environment.yml`
- Manually with pip
- Set up a Python3 environment (e.g., using virtualenv).
- Install [Pytorch](http://pytorch.org/) 1.0.1 (or higher) and TorchVision.
- Install some other packages:
```Shell
# Cython needs to be installed before pycocotools
pip install cython
pip install opencv-python pillow pycocotools matplotlib
```
- If you'd like to train YOLACT, download the COCO dataset and the 2014/2017 annotations. Note that this script will take a while and dump 21gb of files into `./data/coco`.
```Shell
sh data/scripts/COCO.sh
```
- If you'd like to evaluate YOLACT on `test-dev`, download `test-dev` with this script.
```Shell
sh data/scripts/COCO_test.sh
```
- If you want to use YOLACT++, compile deformable convolutional layers (from [DCNv2](https://github.com/CharlesShang/DCNv2/tree/pytorch_1.0)).
Make sure you have the latest CUDA toolkit installed from [NVidia's Website](https://developer.nvidia.com/cuda-toolkit).
```Shell
cd external/DCNv2
python setup.py build develop
```
# Evaluation
Here are our YOLACT models (released on May 5th, 2020) along with their FPS on a GTX 1080 Ti and mAP on `coco 2017 val`:
The training is carried on two GTX 1080 Ti with command:
`
python train.py --config=yolact_base_config --batch_size=8
`
| Image Size | Backbone | Loss | NMS | FPS | box AP | mask AP | Weights |
|:----:|:-------------:|:-------:|:----:|:----:|:----:|:----:|----------------------------------------------------------------------------------------------------------------------|
| 550 | Resnet101-FPN | SL1 | Fast NMS | 30.6 | 31.5 | 29.1 |[SL1.pth](https://share.weiyun.com/5N840Hm) |
| 550 | Resnet101-FPN | CIoU | Fast NMS | 30.6 | 32.1 | 29.6 | [CIoU.pth](https://share.weiyun.com/5EtJ4dJ) |
To evaluate the model, put the corresponding weights file in the `./weights` directory and run one of the following commands. The name of each config is everything before the numbers in the file name (e.g., `yolact_base` for `yolact_base_54_800000.pth`).
## Quantitative Results on COCO
```
# Quantitatively evaluate a trained model on the entire validation set. Make sure you have COCO downloaded as above.
# Output a COCOEval json to submit to the website or to use the run_coco_eval.py script.
# This command will create './results/bbox_detections.json' and './results/mask_detections.json' for detection and instance segmentation respectively.
python eval.py --trained_model=weights/yolact_base_54_800000.pth --output_coco_json
# You can run COCOEval on the files created in the previous command. The performance should match my implementation in eval.py.
python run_coco_eval.py
# To output a coco json file for test-dev, make sure you have test-dev downloaded from above and go
python eval.py --trained_model=weights/yolact_base_54_800000.pth --output_coco_json --dataset=coco2017_testdev_dataset
```
## Qualitative Results on COCO
```
# Display qualitative results on COCO. From here on I'll use a confidence threshold of 0.15.
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --display
```
## Cluster-NMS Using Benchmark on COCO
```
python eval.py --trained_model=weights/yolact_base_54_800000.pth --benchmark
```
#### Hardware
- 1 GTX 1080 Ti
- Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz
| Image Size | Backbone | Loss | NMS | FPS | box AP | box AP75 | box AR100 | mask AP | mask AP75 | mask AR100 |
|:----:|:-------------:|:-------:|:------------------------------------:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
| 550 | Resnet101-FPN | CIoU | Fast NMS |**30.6**| 32.1 | 33.9 | 43.0 | 29.6 | 30.9 | 40.3 |
| 550 | Resnet101-FPN | CIoU | Original NMS | 11.5 | 32.5 | 34.1 | 45.1 | 29.7 | 31.0 | 41.7 |
| 550 | Resnet101-FPN | CIoU | Cluster-NMS | 28.8 | 32.5 | 34.1 | 45.2 | 29.7 | 31.0 | 41.7 |
| 550 | Resnet101-FPN | CIoU | SPM Cluster-NMS | 28.6 | 33.1 | 35.2 | 48.8 |**30.3**|**31.7**| 43.6 |
| 550 | Resnet101-FPN | CIoU | SPM + Distance Cluster-NMS | 27.1 | 33.2 | 35.2 |**49.2**| 30.2 |**31.7**|**43.8**|
| 550 | Resnet101-FPN | CIoU | SPM + Distance + Weighted Cluster-NMS | 26.5 |**33.4**|**35.5**| 49.1 |**30.3**| 31.6 |**43.8**|
The following table is evaluated by using their pretrained weight of YOLACT. ([yolact_resnet50_54_800000.pth](https://ucdavis365-my.sharepoint.com/:u:/g/personal/yongjaelee_ucdavis_edu/EUVpxoSXaqNIlssoLKOEoCcB1m0RpzGq_Khp5n1VX3zcUw))
| Image Size | Backbone | Loss | NMS | FPS | box AP | box AP75 | box AR100 | mask AP | mask AP75 | mask AR100 |
|:----:|:-------------:|:-------:|:-----------------------------------:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
| 550 | Resnet50-FPN | SL1 | Fast NMS |**41.6**| 30.2 | 31.9 | 42.0 | 28.0 | 29.1 | 39.4 |
| 550 | Resnet50-FPN | SL1 | Original NMS | 12.8 | 30.7 | 32.0 | 44.1 | 28.1 | 29.2 | 40.7 |
| 550 | Resnet50-FPN | SL1 | Cluster-NMS | 38.2 | 30.7 | 32.0 | 44.1 | 28.1 | 29.2 | 40.7 |
| 550 | Resnet50-FPN | SL1 | SPM Cluster-NMS | 37.7 | 31.3 | 33.2 | 48.0 |**28.8**|**29.9**| 42.8 |
| 550 | Resnet50-FPN | SL1 | SPM + Distance Cluster-NMS | 35.2 | 31.3 | 33.3 | 48.2 | 28.7 |**29.9**| 42.9 |
| 550 | Resnet50-FPN | SL1 | SPM + Distance + Weighted Cluster-NMS | 34.2 |**31.8**|**33.9**|**48.3**|**28.8**|**29.9**|**43.0**|
The following table is evaluated by using their pretrained weight of YOLACT. ([yolact_base_54_800000.pth](https://drive.google.com/file/d/1UYy3dMapbH1BnmtZU4WH1zbYgOzzHHf_/view?usp=sharing))
| Image Size | Backbone | Loss | NMS | FPS | box AP | box AP75 | box AR100 | mask AP | mask AP75 | mask AR100 |
|:----:|:-------------:|:-------:|:-----------------------------------:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
| 550 | Resnet101-FPN | SL1 | Fast NMS |**30.6**| 32.5 | 34.6 | 43.9 | 29.8 | 31.3 | 40.8 |
| 550 | Resnet101-FPN | SL1 | Original NMS | 11.9 | 32.9 | 34.8 | 45.8 | 29.9 | 31.4 | 42.1 |
| 550 | Resnet101-FPN | SL1 | Cluster-NMS | 29.2 | 32.9 | 34.8 | 45.9 | 29.9 | 31.4 | 42.1 |
| 550 | Resnet101-FPN | SL1 | SPM Cluster-NMS | 28.8 | 33.5 | 35.9 | 49.7 |**30.5**|**32.1**| 44.1 |
| 550 | Resnet101-FPN | SL1 | SPM + Distance Cluster-NMS | 27.5 | 33.5 | 35.9 |**50.2**| 30.4 | 32.0 |**44.3**|
| 550 | Resnet101-FPN | SL1 | SPM + Distance + Weighted Cluster-NMS | 26.7 |**34.0**|**36.6**| 49.9 |**30.5**| 32.0 |**44.3**|
The following table is evaluated by using their pretrained weight of YOLACT++. ([yolact_plus_base_54_800000.pth](https://ucdavis365-my.sharepoint.com/:u:/g/personal/yongjaelee_ucdavis_edu/EVQ62sF0SrJPrl_68onyHF8BpG7c05A8PavV4a849sZgEA))
| Image Size | Backbone | Loss | NMS | FPS | box AP | box AP75 | box AR100 | mask AP | mask AP75 | mask AR100 |
|:----:|:-------------:|:-------:|:-----------------------------------:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
| 550 | Resnet101-FPN | SL1 | Fast NMS |**25.1**| 35.8 | 38.7 | 45.5 | 34.4 | 36.8 | 42.6 |
| 550 | Resnet101-FPN | SL1 | Original NMS | 10.9 | 36.4 | 39.1 | 48.0 | 34.7 | 37.1 | 44.1 |
| 550 | Resnet101-FPN | SL1 | Cluster-NMS | 23.7 | 36.4 | 39.1 | 48.0 | 34.7 | 37.1 | 44.1 |
| 550 | Resnet101-FPN | SL1 | SPM Cluster-NMS | 23.2 | 36.9 | 40.1 | 52.8 |**35.0**| 37.5 |**46.3**|
| 550 | Resnet101-FPN | SL1 | SPM + Distance Cluster-NMS | 22.0 | 36.9 | 40.2 |**53.0**| 34.9 | 37.5 |**46.3**|
| 550 | Resnet101-FPN | SL1 | SPM + Distance + Weighted Cluster-NMS | 21.7 |**37.4**|**40.6**| 52.5 |**35.0**|**37.6**|**46.3**|
#### Note:
- Things we did but did not appear in the paper: SPM + Distance + Weighted Cluster-NMS. Here the box coordinate weighted average is only performed in `IoU> 0.8`. We searched that `IoU>0.5` is not good for YOLACT and `IoU>0.9` is almost same to `SPM + Distance Cluster-NMS`. (Refer to [CAD](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8265304) for the details of Weighted-NMS.)
- The Original NMS implemented by YOLACT is faster than ours, because it first uses a score threshold (0.05) to get the set of candidate boxes, and then doing NMS is faster (taking YOLACT ResNet101-FPN as an example, 22 ~ 23 FPS with a slight performance drop). In order to get the same result as our Cluster-NMS, we modify the process of Original NMS.
- Note that Torchvision NMS has the fastest speed, that is owing to CUDA implementation and engineering accelerations (like upper triangular IoU matrix only). However, our Cluster-NMS requires less iterations for NMS and can also be further accelerated by adopting engineering tricks.
- Currently, Torchvision NMS uses IoU as the criterion, not DIoU. However, if we directly replace IoU with DIoU in Original NMS, it will cost much more time due to the sequential operation. Now, Cluster-DIoU-NMS will significantly speed up DIoU-NMS and obtain exactly the same result.
- Torchvision NMS is a function in Torchvision>=0.3, and our Cluster-NMS can be applied to any projects that use low version of Torchvision and other deep learning frameworks as long as it can do matrix operations. **No other import, no need to compile, less iteration, fully GPU-accelerated and better performance**.
## Images
```Shell
# Display qualitative results on the specified image.
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --image=my_image.png
# Process an image and save it to another file.
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --image=input_image.png:output_image.png
# Process a whole folder of images.
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --images=path/to/input/folder:path/to/output/folder
```
## Video
```Shell
# Display a video in real-time. "--video_multiframe" will process that many frames at once for improved performance.
# If you want, use "--display_fps" to draw the FPS directly on the frame.
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --video_multiframe=4 --video=my_video.mp4
# Display a webcam feed in real-time. If you have multiple webcams pass the index of the webcam you want instead of 0.
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --video_multiframe=4 --video=0
# Process a video and save it to another file. This uses the same pipeline as the ones above now, so it's fast!
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --video_multiframe=4 --video=input_video.mp4:output_video.mp4
```
As you can tell, `eval.py` can do a ton of stuff. Run the `--help` command to see everything it can do.
```Shell
python eval.py --help
```
# Training
By default, we train on COCO. Make sure to download the entire dataset using the commands above.
- To train, grab an imagenet-pretrained model and put it in `./weights`.
- For Resnet101, download `resnet101_reducedfc.pth` from [here](https://drive.google.com/file/d/1tvqFPd4bJtakOlmn-uIA492g2qurRChj/view?usp=sharing).
- For Resnet50, download `resnet50-19c8e357.pth` from [here](https://drive.google.com/file/d/1Jy3yCdbatgXa5YYIdTCRrSV0S9V5g1rn/view?usp=sharing).
- For Darknet53, download `darknet53.pth` from [here](https://drive.google.com/file/d/17Y431j4sagFpSReuPNoFcj9h7azDTZFf/view?usp=sharing).
- Run one of the training commands below.
- Note that you can press ctrl+c while training and it will save an `*_interrupt.pth` file at the current iteration.
- All weights are saved in the `./weights` directory by default with the file name `<config>_<epoch>_<iteration>.pth`.
```Shell
# Trains using the base config with a batch size of 8 (the default).
python train.py --config=yolact_base_config
# Trains yolact_base_config with a batch_size of 5. For the 550px models, 1 batch takes up around 1.5 gigs of VRAM, so specify accordingly.
python train.py --config=yolact_base_config --batch_size=5
# Resume training yolact_base with a specific weight file and start from the iteration specified in the weight file's name.
python train.py --config=yolact_base_config --resume=weights/yolact_base_10_32100.pth --start_iter=-1
# Use the help option to see a description of all available command line arguments
python train.py --help
```
## Multi-GPU Support
YOLACT now supports multiple GPUs seamlessly during training:
- Before running any of the scripts, run: `export CUDA_VISIBLE_DEVICES=[gpus]`
- Where you should replace [gpus] with a comma separated list of the index of each GPU you want to use (e.g., 0,1,2,3).
- You should still do this if only using 1 GPU.
- You can check the indices of your GPUs with `nvidia-smi`.
- Then, simply set the batch size to `8*num_gpus` with the training commands above. The training script will automatically scale the hyperparameters to the right values.
- If you have memory to spare you can increase the batch size further, but keep it a multiple of the number of GPUs you're using.
- If you want to allocate the images per GPU specific for different GPUs, you can use `--batch_alloc=[alloc]` where [alloc] is a comma seprated list containing the number of images on each GPU. This must sum to `batch_size`.
## Acknowledgments
Thank you to [Daniel Bolya](https://github.com/dbolya/) for his fork of [YOLACT & YOLACT++](https://github.com/dbolya/yolact), which is an excellent work for real-time instance segmentation.
================================================
FILE: README_zh-CN.md
================================================
### [English](README.md) | 简体中文
## Complete-IoU Loss and Cluster-NMS for Improving Object Detection and Instance Segmentation.
我们的论文已收录于**IEEE Transactions on Cybernetics (TCYB)**.
### 本代码基于YOLACT++.
这是我们论文的代码实现:
- [Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression](https://arxiv.org/abs/1911.08287)
- [Enhancing Geometric Factors into Model Learning and Inference for Object Detection and Instance Segmentation](https://arxiv.org/abs/2005.03572)
```
@Inproceedings{zheng2020diou,
author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei},
title = {Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression},
booktitle = {The AAAI Conference on Artificial Intelligence (AAAI)},
year = {2020},
}
@Article{zheng2021ciou,
author = {Zheng, Zhaohui and Wang, Ping and Ren, Dongwei and Liu, Wei and Ye, Rongguang and Hu, Qinghua and Zuo, Wangmeng},
title = {Enhancing Geometric Factors in Model Learning and Inference for Object Detection and Instance Segmentation},
booktitle = {IEEE Transactions on Cybernetics},
year = {2021},
}
```
## Cluster-NMS的描述与使用
下面是Cluster-NMS算法的示意图,其中 X 表示IoU矩阵,它是由`X=jaccard(boxes,boxes).triu_(diagonal=1) > nms_thresh`计算得到的,当然`boxes`会事先用分类score降序排列。
NMS的输入是形状为[n,4]的`boxes`,以及形状为[80,n]的分类`scores`。(以coco为例)
NMS有两种途径,各有各的特点。
第一种是所有类别具有相同数量的框。首先,我们对每个类依照`scores`选取top k=200个框。于是`boxes`的形状变为[80,200,4]。执行Cluster-NMS主体程序并最后保留`scores>0.01`的框。最终返回前100个高分的框。
第二种是不同的类别具有不同数量的框。首先,我们一开始就用`scores`阈值(比如0.01)过滤掉了大多数的低分检测框。这一步导致了不同的类别可能剩下了不同数量的框。并且注意到同一个框可能出现很多次,因为其可能有多个类别的`score`都大于了0.01这个阈值。接着把所有的框放到一起并按照`score`降序排列。
接着我们给boxes添加偏移量,使用`torch.arange(0,80)`。这将导致不同类别的框不再相交,它们的IoU一定为0。这是因为(x1,y1,x2,y2)的坐标都在(0,1)区间内,因此我给每个框的坐标都加上它的类标签,就可以强行让不同类别的框分到不同的cluster中。例如某个框属于第61类,其坐标本来都介于(0,1)之间,加上了类标签偏移量后,它的坐标都将介于(60,61)这个区间。
最后执行Cluster-NMS主体程序,并返回前100个框。对于这种途径的Cluster-NMS方法,可参阅我们的另一代码库[SSD](https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/detection/detection.py)
## 指南
### 1) CIoU 与 Cluster-NMS
1. YOLACT (See [YOLACT](https://github.com/Zzh-tju/CIoU#YOLACT))
2. YOLOv3-pytorch [https://github.com/Zzh-tju/ultralytics-YOLOv3-Cluster-NMS](https://github.com/Zzh-tju/ultralytics-YOLOv3-Cluster-NMS)
3. YOLOv5 (支持批处理模式的Cluster-NMS。当使用测试阶段增强时,如多尺度测试,将大大加速NMS。) [https://github.com/Zzh-tju/yolov5](https://github.com/Zzh-tju/yolov5)
4. SSD-pytorch [https://github.com/Zzh-tju/DIoU-SSD-pytorch](https://github.com/Zzh-tju/DIoU-SSD-pytorch)
### 2) DIoU 与 CIoU 纳入检测器
1. YOLO v3 [https://github.com/Zzh-tju/DIoU-darknet](https://github.com/Zzh-tju/DIoU-darknet)
2. SSD [https://github.com/Zzh-tju/DIoU-SSD-pytorch](https://github.com/Zzh-tju/DIoU-SSD-pytorch)
3. Faster R-CNN [https://github.com/Zzh-tju/DIoU-pytorch-detectron](https://github.com/Zzh-tju/DIoU-pytorch-detectron)
4. 模拟实验 [https://github.com/Zzh-tju/DIoU](https://github.com/Zzh-tju/DIoU)
# YOLACT
### 代码位置与可选项
参见[layers/modules/multibox_loss.py](layers/modules/multibox_loss.py)中的`ciou` function,这是我们PyTorch实现的CIoU loss.
目前NMS支持两种模式:(见[eval.py](eval.py))
1. Cross-class模式,它将忽略类别,也就是所有类别混在一起处理。`cross_class_nms=True`,这将比per-class模式快一点,但性能会略微下降。
2. per-class模式,不同的类别分别NMS。(`cross_class_nms=False`)
目前,NMS支持这几种设置:`fast_nms`, `cluster_nms`, `cluster_diounms`, `spm`, `spm_dist`, `spm_dist_weighted`。
见[layers/functions/detection.py](layers/functions/detection.py),这是我们Cluster-NMS的PyTorch实现.
# 安装
要想使用YOLACT++, 确保你编译了DCNv2的代码.
- 下载本代码,并进入:
```Shell
git clone https://github.com/Zzh-tju/CIoU.git
cd CIoU
```
- 使用以下的其中一个方法安装环境:
- 使用[Anaconda](https://www.anaconda.com/distribution/)
- 运行`conda env create -f environment.yml`
- 使用pip
- 创建Python3环境(例如使用virtualenv).
- 安装[Pytorch](http://pytorch.org/) 1.0.1 (或更高版本)与TorchVision.
- 安装一些其它的包:
```Shell
# Cython需要在pycocotools之前安装
pip install cython
pip install opencv-python pillow pycocotools matplotlib
```
- 如要训练YOLACT, 下载COCO 2017数据集. 请注意,此脚本将花费一些时间,并将21G的文件转储到`./data/coco`.
```Shell
sh data/scripts/COCO.sh
```
- 如想在COCO `test-dev`上测试YOLACT, 需下载`test-dev`:
```Shell
sh data/scripts/COCO_test.sh
```
- 如需使用YOLACT++, 编译DCN layer ([DCNv2](https://github.com/CharlesShang/DCNv2/tree/pytorch_1.0)).
```Shell
cd external/DCNv2
python setup.py build develop
```
# 评估
以下是我们训练的YOLACT模型 (2020.5.5发布) 以及测试速度FPS,在单张GTX 1080 Ti上评估,所用测试集为`coco 2017 val`:
训练在双GPU上进行,使用如下命令:
`
python train.py --config=yolact_base_config --batch_size=8
`
| Image Size | Backbone | Loss | NMS | FPS | box AP | mask AP | Weights |
|:----:|:-------------:|:-------:|:----:|:----:|:----:|:----:|----------------------------------------------------------------------------------------------------------------------|
| 550 | Resnet101-FPN | SL1 | Fast NMS | 30.6 | 31.5 | 29.1 |[SL1.pth](https://share.weiyun.com/5N840Hm) |
| 550 | Resnet101-FPN | CIoU | Fast NMS | 30.6 | 32.1 | 29.6 | [CIoU.pth](https://share.weiyun.com/5EtJ4dJ) |
如要测试模型,请将权重放置在`./weights`目录下,并运行以下其中一个命令。
## COCO结果
```
# 在整个测试集上评估模型,这将输出一个COCOEval json文件,你可用于提交至COCO服务器(对于test-dev)或用run_coco_eval.py脚本直接评估(对于val 2017)。
# 以下命令将创建'./results/bbox_detections.json' 与 './results/mask_detections.json' ,分别对应目标检测与实例分割。
python eval.py --trained_model=weights/yolact_base_54_800000.pth --output_coco_json
# 运行以下命令来评估刚刚生成的json文件。
python run_coco_eval.py
# 如想生成一个test-dev的COCO json文件,请确保你下载了test-dev数据集,然后运行
python eval.py --trained_model=weights/yolact_base_54_800000.pth --output_coco_json --dataset=coco2017_testdev_dataset
```
## COCO检测结果可视化
```
# 一般会使用0.15的分类score阈值。
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --display
```
## Cluster-NMS 速度评估
```
python eval.py --trained_model=weights/yolact_base_54_800000.pth --benchmark
```
#### 设备
- 1 GTX 1080 Ti
- Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz
| Image Size | Backbone | Loss | NMS | FPS | box AP | box AP75 | box AR100 | mask AP | mask AP75 | mask AR100 |
|:----:|:-------------:|:-------:|:------------------------------------:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
| 550 | Resnet101-FPN | CIoU | Fast NMS |**30.6**| 32.1 | 33.9 | 43.0 | 29.6 | 30.9 | 40.3 |
| 550 | Resnet101-FPN | CIoU | Original NMS | 11.5 | 32.5 | 34.1 | 45.1 | 29.7 | 31.0 | 41.7 |
| 550 | Resnet101-FPN | CIoU | Cluster-NMS | 28.8 | 32.5 | 34.1 | 45.2 | 29.7 | 31.0 | 41.7 |
| 550 | Resnet101-FPN | CIoU | SPM Cluster-NMS | 28.6 | 33.1 | 35.2 | 48.8 |**30.3**|**31.7**| 43.6 |
| 550 | Resnet101-FPN | CIoU | SPM + Distance Cluster-NMS | 27.1 | 33.2 | 35.2 |**49.2**| 30.2 |**31.7**|**43.8**|
| 550 | Resnet101-FPN | CIoU | SPM + Distance + Weighted Cluster-NMS | 26.5 |**33.4**|**35.5**| 49.1 |**30.3**| 31.6 |**43.8**|
以下是使用YOLACT官方的预训练权重评估。([yolact_resnet50_54_800000.pth](https://ucdavis365-my.sharepoint.com/:u:/g/personal/yongjaelee_ucdavis_edu/EUVpxoSXaqNIlssoLKOEoCcB1m0RpzGq_Khp5n1VX3zcUw))
| Image Size | Backbone | Loss | NMS | FPS | box AP | box AP75 | box AR100 | mask AP | mask AP75 | mask AR100 |
|:----:|:-------------:|:-------:|:-----------------------------------:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
| 550 | Resnet50-FPN | SL1 | Fast NMS |**41.6**| 30.2 | 31.9 | 42.0 | 28.0 | 29.1 | 39.4 |
| 550 | Resnet50-FPN | SL1 | Original NMS | 12.8 | 30.7 | 32.0 | 44.1 | 28.1 | 29.2 | 40.7 |
| 550 | Resnet50-FPN | SL1 | Cluster-NMS | 38.2 | 30.7 | 32.0 | 44.1 | 28.1 | 29.2 | 40.7 |
| 550 | Resnet50-FPN | SL1 | SPM Cluster-NMS | 37.7 | 31.3 | 33.2 | 48.0 |**28.8**|**29.9**| 42.8 |
| 550 | Resnet50-FPN | SL1 | SPM + Distance Cluster-NMS | 35.2 | 31.3 | 33.3 | 48.2 | 28.7 |**29.9**| 42.9 |
| 550 | Resnet50-FPN | SL1 | SPM + Distance + Weighted Cluster-NMS | 34.2 |**31.8**|**33.9**|**48.3**|**28.8**|**29.9**|**43.0**|
以下是使用YOLACT官方的预训练权重评估。([yolact_base_54_800000.pth](https://drive.google.com/file/d/1UYy3dMapbH1BnmtZU4WH1zbYgOzzHHf_/view?usp=sharing))
| Image Size | Backbone | Loss | NMS | FPS | box AP | box AP75 | box AR100 | mask AP | mask AP75 | mask AR100 |
|:----:|:-------------:|:-------:|:-----------------------------------:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
| 550 | Resnet101-FPN | SL1 | Fast NMS |**30.6**| 32.5 | 34.6 | 43.9 | 29.8 | 31.3 | 40.8 |
| 550 | Resnet101-FPN | SL1 | Original NMS | 11.9 | 32.9 | 34.8 | 45.8 | 29.9 | 31.4 | 42.1 |
| 550 | Resnet101-FPN | SL1 | Cluster-NMS | 29.2 | 32.9 | 34.8 | 45.9 | 29.9 | 31.4 | 42.1 |
| 550 | Resnet101-FPN | SL1 | SPM Cluster-NMS | 28.8 | 33.5 | 35.9 | 49.7 |**30.5**|**32.1**| 44.1 |
| 550 | Resnet101-FPN | SL1 | SPM + Distance Cluster-NMS | 27.5 | 33.5 | 35.9 |**50.2**| 30.4 | 32.0 |**44.3**|
| 550 | Resnet101-FPN | SL1 | SPM + Distance + Weighted Cluster-NMS | 26.7 |**34.0**|**36.6**| 49.9 |**30.5**| 32.0 |**44.3**|
以下是YOLACT++,同样是官方的预训练权重评估。([yolact_plus_base_54_800000.pth](https://ucdavis365-my.sharepoint.com/:u:/g/personal/yongjaelee_ucdavis_edu/EVQ62sF0SrJPrl_68onyHF8BpG7c05A8PavV4a849sZgEA))
| Image Size | Backbone | Loss | NMS | FPS | box AP | box AP75 | box AR100 | mask AP | mask AP75 | mask AR100 |
|:----:|:-------------:|:-------:|:-----------------------------------:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
| 550 | Resnet101-FPN | SL1 | Fast NMS |**25.1**| 35.8 | 38.7 | 45.5 | 34.4 | 36.8 | 42.6 |
| 550 | Resnet101-FPN | SL1 | Original NMS | 10.9 | 36.4 | 39.1 | 48.0 | 34.7 | 37.1 | 44.1 |
| 550 | Resnet101-FPN | SL1 | Cluster-NMS | 23.7 | 36.4 | 39.1 | 48.0 | 34.7 | 37.1 | 44.1 |
| 550 | Resnet101-FPN | SL1 | SPM Cluster-NMS | 23.2 | 36.9 | 40.1 | 52.8 |**35.0**| 37.5 |**46.3**|
| 550 | Resnet101-FPN | SL1 | SPM + Distance Cluster-NMS | 22.0 | 36.9 | 40.2 |**53.0**| 34.9 | 37.5 |**46.3**|
| 550 | Resnet101-FPN | SL1 | SPM + Distance + Weighted Cluster-NMS | 21.7 |**37.4**|**40.6**| 52.5 |**35.0**|**37.6**|**46.3**|
#### 注:
- 我们还测试了SPM + Distance + Weighted Cluster-NMS策略,其中坐标加权仅针对`IoU> 0.8`的框,我们发现`IoU>0.5`对于YOLACT来说不够好,而`IoU>0.9`又几乎与`SPM + Distance Cluster-NMS`相当. (参见[CAD](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8265304)了解更多Weighted-NMS的细节。)
- YOLACT官方提供的Original NMS要比本代码的快,因为其预先使用了分类score阈值 (0.05)来过滤了大量的框,以YOLACT ResNet101-FPN为例,约22 ~ 23 FPS,但性能会有下降,为了确保相同的性能,我们不使用该预先分类score阈值。
- 注意到Torchvision NMS拥有最快的速度,这是由于CUDA编写与工程化加速(如只计算上三角IoU矩阵)。而我们的Cluster-NMS只需更少的迭代,并也可以进一步工程化加速。
- 目前Torchvision NMS使用IoU为评价准则,而不是DIoU。然而,如果我们直接在Original NMS中替换IoU为DIoU,得到DIoU-NMS,这将导致更大的计算开销。现在,Cluster-DIoU-NMS将大大加速DIoU-NMS,并保持与DIoU-NMS一样的精度。
- Torchvision NMS是Torchvision>=0.3中的函数,而我们的Cluster-NMS可以应用于任何低版本Torchvision的代码库或其他的深度学习框架,只要其可以进行矩阵运算。**无需其他import,无需编译,更少的迭代,完全GPU加速,以及更好的性能**。
## 可视化
```Shell
# 如下命令检测特定的图片。
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --image=my_image.png
# 检测特定的图片并保存。
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --image=input_image.png:output_image.png
# 检测一个目录中的所有图片。
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --images=path/to/input/folder:path/to/output/folder
```
## 视频检测
```Shell
# 实时检测视频流,"--video_multiframe" 将一次性检测多帧以提高性能。
# 使用"--display_fps"将打印FPS在每一帧上。
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --video_multiframe=4 --video=my_video.mp4
# 实时显示网络摄像头。如果你有多个网络摄像头,请传递所需网络摄像头的索引,而不是0。
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --video_multiframe=4 --video=0
# 处理视频并保存。
python eval.py --trained_model=weights/yolact_base_54_800000.pth --score_threshold=0.15 --top_k=15 --video_multiframe=4 --video=input_video.mp4:output_video.mp4
```
正如你看到的,`eval.py`能做很多事情,添加`--help`命令查看它所能做的一切。
```Shell
python eval.py --help
```
# 训练
默认训练COCO。
- 准备ImageNet预训练模型并将其放置于`./weights`目录。
- [resnet101_reducedfc.pth](https://drive.google.com/file/d/1tvqFPd4bJtakOlmn-uIA492g2qurRChj/view?usp=sharing).
- [resnet50-19c8e357.pth](https://drive.google.com/file/d/1Jy3yCdbatgXa5YYIdTCRrSV0S9V5g1rn/view?usp=sharing).
- [darknet53.pth](https://drive.google.com/file/d/17Y431j4sagFpSReuPNoFcj9h7azDTZFf/view?usp=sharing).
- 运行以下其中一个训练命令。
- 训练过程中如按下ctrl+c将保存一个中断的权重`*_interrupt.pth`。
- 所有权重将保存于`./weights`,文件名为`<config>_<epoch>_<iteration>.pth`.
```Shell
# 默认batch size 8的训练,使用base config。
python train.py --config=yolact_base_config
# 训练yolact_base_config, batch_size为5. 对于550px模型,1 batch占用约1.5 gigs的VRAM,因此请相应指定。
python train.py --config=yolact_base_config --batch_size=5
# 从指定的权重接着训练,并于其文件名的轮数开始训。
python train.py --config=yolact_base_config --resume=weights/yolact_base_10_32100.pth --start_iter=-1
# 使用`--help`选项查看所有可用命令行参数的说明。
python train.py --help
```
## Multi-GPU
YOLACT可支持多GPU训练:
- 在运行任何脚本之前,你可以运行 `export CUDA_VISIBLE_DEVICES=[gpus]`
- 你可以使用一个序号列表来替换[gpus],例如0,1,2,3。
- 只要一张卡也可以这样做。
- 用`nvidia-smi`命令可查看可用GPU的序号。
- 接着,简单设置batch size为`8*num_gpus`. 训练脚本将自动将超参数缩放到正确的值。
- 如果你显存够用,你可以继续增大batch size,但要将其保持为正在使用的GPU数量的倍数。
- 如果要为不同的GPU分配不同数量的图片,使用 `--batch_alloc=[alloc]`,其中[alloc]是一个逗号分隔列表,包含每个GPU上的图像数。该值的总和必须为`batch_size`。
## 致谢
感谢[Daniel Bolya](https://github.com/dbolya/)的[YOLACT & YOLACT++](https://github.com/dbolya/yolact), 这是一项实时实例分割的杰出工作。
================================================
FILE: backbone.py
================================================
import torch
import torch.nn as nn
import pickle
from collections import OrderedDict
# DCNv2 (deformable convolution) is an optional compiled extension that is only
# needed for YOLACT++ models. If it has not been built, install a stub that
# raises a helpful error as soon as someone tries to construct a DCN layer.
try:
    from dcn_v2 import DCN
except ImportError:
    def DCN(*args, **kwdargs):
        raise Exception('DCN could not be imported. If you want to use YOLACT++ models, compile DCN. Check the README for instructions.')
class Bottleneck(nn.Module):
    """ The standard ResNet bottleneck residual block (1x1 -> 3x3 -> 1x1).
    Adapted from torchvision.models.resnet. When use_dcn is set, the 3x3 conv
    is replaced with a DCNv2 deformable conv (used by YOLACT++ models).
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d, dilation=1, use_dcn=False):
        super().__init__()

        # 1x1 channel reduction.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False, dilation=dilation)
        self.bn1 = norm_layer(planes)

        if use_dcn:
            # Deformable 3x3 conv. Its offset/mask predictor is zeroed so the
            # block starts out behaving like a regular conv.
            self.conv2 = DCN(planes, planes, kernel_size=3, stride=stride,
                             padding=dilation, dilation=dilation, deformable_groups=1)
            self.conv2.bias.data.zero_()
            self.conv2.conv_offset_mask.weight.data.zero_()
            self.conv2.conv_offset_mask.bias.data.zero_()
        else:
            # Plain 3x3 conv (possibly strided and/or dilated).
            self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                                   padding=dilation, bias=False, dilation=dilation)
        self.bn2 = norm_layer(planes)

        # 1x1 expansion back up to planes * expansion channels.
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False, dilation=dilation)
        self.bn3 = norm_layer(planes * 4)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Shortcut branch; projected when the shape changes.
        identity = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += identity
        return self.relu(out)
class ResNetBackbone(nn.Module):
    """ Adapted from torchvision.models.resnet.

    Builds the four ResNet stages and exposes them as self.layers; forward
    returns the convout of every stage so the FPN can pick from them.
    """

    def __init__(self, layers, dcn_layers=[0, 0, 0, 0], dcn_interval=1, atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm2d):
        # layers:        number of blocks per stage, e.g. [3, 4, 23, 3] for ResNet-101.
        # dcn_layers:    per stage, how many of the *last* blocks use deformable convs.
        # dcn_interval:  among those, only block indices divisible by this use DCN.
        # atrous_layers: indices of stages to make dilated (stride 1) instead of strided.
        # block:         the residual block class used to build each stage.
        # norm_layer:    normalization constructor, called with the channel count.
        super().__init__()

        # These will be populated by _make_layer
        self.num_base_layers = len(layers)
        self.layers = nn.ModuleList()
        self.channels = []
        self.norm_layer = norm_layer
        self.dilation = 1
        self.atrous_layers = atrous_layers

        # From torchvision.models.resnet.Resnet
        self.inplanes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self._make_layer(block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval)
        self._make_layer(block, 128, layers[1], stride=2, dcn_layers=dcn_layers[1], dcn_interval=dcn_interval)
        self._make_layer(block, 256, layers[2], stride=2, dcn_layers=dcn_layers[2], dcn_interval=dcn_interval)
        self._make_layer(block, 512, layers[3], stride=2, dcn_layers=dcn_layers[3], dcn_interval=dcn_interval)

        # This contains every module that should be initialized by loading in pretrained weights.
        # Any extra layers added onto this that won't be initialized by init_backbone will not be
        # in this list. That way, Yolact::init_weights knows which backbone weights to initialize
        # with xavier, and which ones to leave alone.
        self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)]

    def _make_layer(self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1):
        """ Here one layer means a string of n Bottleneck blocks. """
        downsample = None

        # This is actually just to create the connection between layers, and not necessarily to
        # downsample. Even if the second condition is met, it only downsamples when stride != 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            if len(self.layers) in self.atrous_layers:
                # Atrous stage: trade the stride for an extra level of dilation so
                # the spatial resolution is preserved.
                self.dilation += 1
                stride = 1

            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False,
                          dilation=self.dilation),
                self.norm_layer(planes * block.expansion),
            )

        layers = []
        # The first block is deformable only if the whole stage is (dcn_layers >= blocks).
        use_dcn = (dcn_layers >= blocks)
        layers.append(block(self.inplanes, planes, stride, downsample, self.norm_layer, self.dilation, use_dcn=use_dcn))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            # The last `dcn_layers` blocks of the stage use DCN, thinned by dcn_interval.
            use_dcn = ((i+dcn_layers) >= blocks) and (i % dcn_interval == 0)
            layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn))
        layer = nn.Sequential(*layers)

        self.channels.append(planes * block.expansion)
        self.layers.append(layer)

        return layer

    def forward(self, x):
        """ Returns a tuple with the convout of each layer (stage). """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        outs = []
        for layer in self.layers:
            x = layer(x)
            outs.append(x)

        return tuple(outs)

    def init_backbone(self, path):
        """ Initializes the backbone weights for training from a torchvision-style checkpoint. """
        state_dict = torch.load(path)

        # Replace layer1 -> layers.0 etc. (torchvision names stages 'layer1'..'layer4',
        # we store them in a ModuleList called 'layers').
        keys = list(state_dict)
        for key in keys:
            if key.startswith('layer'):
                idx = int(key[5])
                new_key = 'layers.' + str(idx-1) + key[6:]
                state_dict[new_key] = state_dict.pop(key)

        # Note: Using strict=False is berry scary. Triple check this.
        # (Needed because layers added via add_layer have no pretrained weights.)
        self.load_state_dict(state_dict, strict=False)

    def add_layer(self, conv_channels=1024, downsample=2, depth=1, block=Bottleneck):
        """ Add a downsample layer to the backbone as per what SSD does. """
        self._make_layer(block, conv_channels // block.expansion, blocks=depth, stride=downsample)
class ResNetBackboneGN(ResNetBackbone):
    """ A ResNet backbone that uses GroupNorm in place of BatchNorm and loads
    Detectron-format (pickled) pretrained weights. """

    def __init__(self, layers, num_groups=32):
        super().__init__(layers, norm_layer=lambda x: nn.GroupNorm(num_groups, x))

    def init_backbone(self, path):
        """ The path here comes from detectron. So we load it differently. """
        with open(path, 'rb') as f:
            state_dict = pickle.load(f, encoding='latin1') # From the detectron source
            state_dict = state_dict['blobs']

        our_state_dict_keys = list(self.state_dict().keys())
        new_state_dict = {}

        # Helpers that map our parameter names onto Detectron blob names:
        #  - gn_trans:     GroupNorm 'weight'/'bias' -> 'gn_s'/'gn_b'
        #  - layeridx2res: our 0-based stage index   -> Detectron 'res2'..'res5'
        #  - block2branch: 'conv1'/'conv2'/'conv3'   -> 'branch2a'/'branch2b'/'branch2c'
        gn_trans     = lambda x: ('gn_s' if x == 'weight' else 'gn_b')
        layeridx2res = lambda x: 'res' + str(int(x)+2)
        block2branch = lambda x: 'branch2' + ('a', 'b', 'c')[int(x[-1:])-1]

        # Transcribe each Detectron weights name to a Yolact weights name
        for key in our_state_dict_keys:
            parts = key.split('.')
            transcribed_key = ''

            if (parts[0] == 'conv1'):
                transcribed_key = 'conv1_w'
            elif (parts[0] == 'bn1'):
                transcribed_key = 'conv1_' + gn_trans(parts[1])
            elif (parts[0] == 'layers'):
                # Layers added by add_layer have no pretrained counterpart; skip them.
                if int(parts[1]) >= self.num_base_layers: continue

                transcribed_key = layeridx2res(parts[1])
                transcribed_key += '_' + parts[2] + '_'

                if parts[3] == 'downsample':
                    # The projection shortcut is 'branch1' in Detectron naming.
                    transcribed_key += 'branch1_'

                    if parts[4] == '0':
                        transcribed_key += 'w'
                    else:
                        transcribed_key += gn_trans(parts[5])
                else:
                    transcribed_key += block2branch(parts[3]) + '_'

                    if 'conv' in parts[3]:
                        transcribed_key += 'w'
                    else:
                        transcribed_key += gn_trans(parts[4])

            new_state_dict[key] = torch.Tensor(state_dict[transcribed_key])

        # strict=False because we may have extra uninitialized layers at this point
        self.load_state_dict(new_state_dict, strict=False)
def darknetconvlayer(in_channels, out_channels, *args, **kwdargs):
    """
    Implements a conv, then batch norm, then a LeakyReLU activation.
    (The order is conv -> BN -> activation; extra arguments are passed
    straight into the conv layer.)
    """
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, *args, **kwdargs, bias=False),
        nn.BatchNorm2d(out_channels),
        # Darknet uses 0.1 here.
        # See https://github.com/pjreddie/darknet/blob/680d3bde1924c8ee2d1c1dea54d3e56a05ca9a26/src/activations.h#L39
        nn.LeakyReLU(0.1, inplace=True)
    )
class DarkNetBlock(nn.Module):
    """ Darknet residual block: a 1x1 squeeze followed by a 3x3 expand, plus
    a skip connection.

    Note: `channels` is the lesser of the two widths; both the input and the
    output have ``expansion * channels`` channels.
    """
    expansion = 2

    def __init__(self, in_channels, channels):
        super().__init__()

        # 1x1 squeeze down to `channels`, then 3x3 expand back up.
        self.conv1 = darknetconvlayer(in_channels, channels, kernel_size=1)
        self.conv2 = darknetconvlayer(channels, channels * self.expansion, kernel_size=3, padding=1)

    def forward(self, x):
        # Residual connection: f(x) + x.
        y = self.conv1(x)
        y = self.conv2(y)
        return y + x
class DarkNetBackbone(nn.Module):
    """
    An implementation of YOLOv3's Darknet53 in
    https://pjreddie.com/media/files/papers/YOLOv3.pdf

    This is based off of the implementation of Resnet above: each stage is
    a strided downsample conv followed by a run of residual blocks, and
    forward returns the convout of every stage.
    """

    def __init__(self, layers=[1, 2, 8, 8, 4], block=DarkNetBlock):
        super().__init__()

        # Filled in by _make_layer as stages are appended.
        self.num_base_layers = len(layers)
        self.layers = nn.ModuleList()
        self.channels = []

        self._preconv = darknetconvlayer(3, 32, kernel_size=3, padding=1)
        self.in_channels = 32

        # Darknet-53's five stages.
        self._make_layer(block, 32, layers[0])
        self._make_layer(block, 64, layers[1])
        self._make_layer(block, 128, layers[2])
        self._make_layer(block, 256, layers[3])
        self._make_layer(block, 512, layers[4])

        # Every conv listed here gets its weights from init_backbone; layers
        # added afterwards are initialized by Yolact::init_weights instead.
        self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)]

    def _make_layer(self, block, channels, num_blocks, stride=2):
        """ Appends one stage: a strided downsample conv plus num_blocks residual blocks. """
        modules = [
            # The downsample conv at the head of the stage.
            darknetconvlayer(self.in_channels, channels * block.expansion,
                             kernel_size=3, padding=1, stride=stride)
        ]

        # Each block takes in_channels and also outputs channels * expansion.
        self.in_channels = channels * block.expansion
        modules.extend(block(self.in_channels, channels) for _ in range(num_blocks))

        self.channels.append(self.in_channels)
        self.layers.append(nn.Sequential(*modules))

    def forward(self, x):
        """ Returns a tuple with the convout of each stage. """
        x = self._preconv(x)

        outs = []
        for stage in self.layers:
            x = stage(x)
            outs.append(x)

        return tuple(outs)

    def add_layer(self, conv_channels=1024, stride=2, depth=1, block=DarkNetBlock):
        """ Add a downsample layer to the backbone as per what SSD does. """
        self._make_layer(block, conv_channels // block.expansion, num_blocks=depth, stride=stride)

    def init_backbone(self, path):
        """ Initializes the backbone weights for training. """
        # strict=False since the checkpoint may not cover layers added after construction.
        self.load_state_dict(torch.load(path), strict=False)
class VGGBackbone(nn.Module):
    """
    A VGG-style backbone assembled from a layer config list (SSD flavor).

    Args:
        - cfg: A list of layers given as lists. Layers can be either 'M' signifying
                a max pooling layer, a number signifying that many feature maps in
                a conv layer, or a tuple of 'M' or a number and a kwdargs dict to pass
                into the function that creates the layer (e.g. nn.MaxPool2d for 'M').
        - extra_args: A list of lists of arguments to pass into add_layer.
        - norm_layers: Layers indices that need to pass through an l2norm layer.
    """

    def __init__(self, cfg, extra_args=[], norm_layers=[]):
        super().__init__()

        self.channels = []
        self.layers = nn.ModuleList()
        self.in_channels = 3
        self.extra_args = list(reversed(extra_args)) # So I can use it as a stack

        # Keeps track of what the corresponding key will be in the state dict of the
        # pretrained model. For instance, layers.0.2 for us is 2 for the pretrained
        # model but layers.1.1 is 5.
        self.total_layer_count = 0
        self.state_dict_lookup = {}

        for idx, layer_cfg in enumerate(cfg):
            self._make_layer(layer_cfg)

        # Normalization modules applied to the outputs of the selected layers.
        self.norms = nn.ModuleList([nn.BatchNorm2d(self.channels[l]) for l in norm_layers])
        self.norm_lookup = {l: idx for idx, l in enumerate(norm_layers)}

        # These modules will be initialized by init_backbone,
        # so don't overwrite their initialization later.
        self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)]

    def _make_layer(self, cfg):
        """
        Each layer is a sequence of conv layers usually preceded by a max pooling.
        Adapted from torchvision.models.vgg.make_layers.
        """
        layers = []

        for v in cfg:
            # VGG in SSD requires some special layers, so allow layers to be tuples of
            # (<M or num_features>, kwdargs dict)
            args = None
            if isinstance(v, tuple):
                args = v[1]
                v = v[0]

            # v should be either M or a number
            if v == 'M':
                # Set default arguments
                if args is None:
                    args = {'kernel_size': 2, 'stride': 2}

                layers.append(nn.MaxPool2d(**args))
            else:
                # See the comment in __init__ for an explanation of this
                cur_layer_idx = self.total_layer_count + len(layers)
                self.state_dict_lookup[cur_layer_idx] = '%d.%d' % (len(self.layers), len(layers))

                # Set default arguments
                if args is None:
                    args = {'kernel_size': 3, 'padding': 1}

                # Add the layers
                layers.append(nn.Conv2d(self.in_channels, v, **args))
                layers.append(nn.ReLU(inplace=True))
                self.in_channels = v

        self.total_layer_count += len(layers)
        self.channels.append(self.in_channels)
        self.layers.append(nn.Sequential(*layers))

    def forward(self, x):
        """ Returns a tuple with the convout of each layer. """
        outs = []

        for idx, layer in enumerate(self.layers):
            x = layer(x)

            # Apply an l2norm module to the selected layers
            # Note that this differs from the original implementation
            if idx in self.norm_lookup:
                x = self.norms[self.norm_lookup[idx]](x)
            outs.append(x)

        return tuple(outs)

    def transform_key(self, k):
        """ Transform e.g. features.24.bias to layers.4.1.bias """
        vals = k.split('.')
        layerIdx = self.state_dict_lookup[int(vals[0])]
        return 'layers.%s.%s' % (layerIdx, vals[1])

    def init_backbone(self, path):
        """ Initializes the backbone weights for training. """
        state_dict = torch.load(path)
        state_dict = OrderedDict([(self.transform_key(k), v) for k,v in state_dict.items()])

        # strict=False because layers added via add_layer have no pretrained weights.
        self.load_state_dict(state_dict, strict=False)

    def add_layer(self, conv_channels=128, downsample=2):
        """ Add a downsample layer to the backbone as per what SSD does. """
        if len(self.extra_args) > 0:
            conv_channels, downsample = self.extra_args.pop()

        # No padding for a stride-1 (non-downsampling) extra layer.
        padding = 1 if downsample > 1 else 0

        layer = nn.Sequential(
            nn.Conv2d(self.in_channels, conv_channels, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(conv_channels, conv_channels*2, kernel_size=3, stride=downsample, padding=padding),
            nn.ReLU(inplace=True)
        )

        self.in_channels = conv_channels*2
        self.channels.append(self.in_channels)
        self.layers.append(layer)
def construct_backbone(cfg):
    """ Constructs a backbone given a backbone config object (see config.py). """
    backbone = cfg.type(*cfg.args)

    # Keep appending downsample layers until the highest selected layer index exists.
    required = max(cfg.selected_layers) + 1
    while len(backbone.layers) < required:
        backbone.add_layer()

    return backbone
================================================
FILE: data/__init__.py
================================================
from .config import *
from .coco import *
import torch
import cv2
import numpy as np
================================================
FILE: data/coco.py
================================================
import os
import os.path as osp
import sys
import torch
import torch.utils.data as data
import torch.nn.functional as F
import cv2
import numpy as np
from .config import cfg
from pycocotools import mask as maskUtils
import random
def get_label_map():
    """ Returns the dataset's category_id -> label map from the config, or an
    identity map over 1..num_classes when the config does not provide one. """
    label_map = cfg.dataset.label_map
    if label_map is not None:
        return label_map
    return {i + 1: i + 1 for i in range(len(cfg.dataset.class_names))}
class COCOAnnotationTransform(object):
    """Converts a list of COCO annotation dicts into rows of
    [xmin, ymin, xmax, ymax, label_idx] with coordinates normalized by the
    image size. Category ids are remapped through the dataset's label map
    into 0-indexed labels; negative category ids pass through unmapped.
    """

    def __init__(self):
        self.label_map = get_label_map()

    def __call__(self, target, width, height):
        """
        Args:
            target (dict): COCO target json annotation as a python dict
            height (int): height
            width (int): width
        Returns:
            a list containing lists of bounding boxes [bbox coords, class idx]
        """
        scale = np.array([width, height, width, height])
        res = []

        for obj in target:
            if 'bbox' not in obj:
                print("No bbox found for object ", obj)
                continue

            x, y, w, h = obj['bbox']

            label_idx = obj['category_id']
            if label_idx >= 0:
                label_idx = self.label_map[label_idx] - 1

            # COCO boxes are [x, y, w, h]; convert to normalized corner form.
            box = list(np.array([x, y, x + w, y + h]) / scale)
            box.append(label_idx)
            res.append(box)  # [xmin, ymin, xmax, ymax, label_idx]

        return res
class COCODetection(data.Dataset):
    """`MS Coco Detection `_ Dataset.
    Args:
        root (string): Root directory where images are downloaded to.
        set_name (string): Name of the specific set of COCO images.
        transform (callable, optional): A function/transform that augments the
                                        raw images`
        target_transform (callable, optional): A function/transform that takes
        in the target (bbox) and transforms it.
        prep_crowds (bool): Whether or not to prepare crowds for the evaluation step.
    """

    def __init__(self, image_path, info_file, transform=None,
                 target_transform=None,
                 dataset_name='MS COCO', has_gt=True):
        # Do this here because we have too many things named COCO
        from pycocotools.coco import COCO

        if target_transform is None:
            target_transform = COCOAnnotationTransform()

        self.root = image_path
        self.coco = COCO(info_file)

        self.ids = list(self.coco.imgToAnns.keys())
        # Fall back to every image when there are no annotations (e.g. test-dev).
        if len(self.ids) == 0 or not has_gt:
            self.ids = list(self.coco.imgs.keys())

        self.transform = transform
        # Bug fix: previously the target_transform argument was silently ignored
        # and a fresh COCOAnnotationTransform was always used instead.
        self.target_transform = target_transform

        self.name = dataset_name
        self.has_gt = has_gt

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: Tuple (image, (target, masks, num_crowds)).
                   target is the object returned by ``coco.loadAnns``.
        """
        im, gt, masks, h, w, num_crowds = self.pull_item(index)
        return im, (gt, masks, num_crowds)

    def __len__(self):
        return len(self.ids)

    def pull_item(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: Tuple (image, target, masks, height, width, crowd).
                   target is the object returned by ``coco.loadAnns``.
            Note that if no crowd annotations exist, crowd will be None
        """
        img_id = self.ids[index]

        if self.has_gt:
            ann_ids = self.coco.getAnnIds(imgIds=img_id)

            # Target has {'segmentation', 'area', iscrowd', 'image_id', 'bbox', 'category_id'}
            target = [x for x in self.coco.loadAnns(ann_ids) if x['image_id'] == img_id]
        else:
            target = []

        # Separate out crowd annotations. These are annotations that signify a large crowd of
        # objects of said class, where there is no annotation for each individual object. Both
        # during testing and training, consider these crowds as neutral.
        crowd  = [x for x in target if ('iscrowd' in x and x['iscrowd'])]
        target = [x for x in target if not ('iscrowd' in x and x['iscrowd'])]
        num_crowds = len(crowd)

        # Crowds get a sentinel label so downstream code can treat them as neutral.
        for x in crowd:
            x['category_id'] = -1

        # This is so we ensure that all crowd annotations are at the end of the array
        target += crowd

        # The split here is to have compatibility with both COCO2014 and 2017 annotations.
        # In 2014, images have the pattern COCO_{train/val}2014_%012d.jpg, while in 2017 it's %012d.jpg.
        # Our script downloads the images as %012d.jpg so convert accordingly.
        file_name = self.coco.loadImgs(img_id)[0]['file_name']

        if file_name.startswith('COCO'):
            file_name = file_name.split('_')[-1]

        path = osp.join(self.root, file_name)
        assert osp.exists(path), 'Image path does not exist: {}'.format(path)

        img = cv2.imread(path)
        height, width, _ = img.shape

        # Default to None so the no-annotation path below is well-defined.
        masks = None
        if len(target) > 0:
            # Pool all the masks for this image into one [num_objects,height,width] matrix
            masks = [self.coco.annToMask(obj).reshape(-1) for obj in target]
            masks = np.vstack(masks)
            masks = masks.reshape(-1, height, width)

        if self.target_transform is not None and len(target) > 0:
            target = self.target_transform(target, width, height)

        if self.transform is not None:
            if len(target) > 0:
                target = np.array(target)
                img, masks, boxes, labels = self.transform(img, masks, target[:, :4],
                                                           {'num_crowds': num_crowds, 'labels': target[:, 4]})

                # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations
                num_crowds = labels['num_crowds']
                labels     = labels['labels']

                target = np.hstack((boxes, np.expand_dims(labels, axis=1)))

                # The augmentation may have cropped out every ground-truth box;
                # resample a different image in that case. (Bug fix: this check
                # used to run even when target was None, raising AttributeError.)
                if target.shape[0] == 0:
                    print('Warning: Augmentation output an example with no ground truth. Resampling...')
                    return self.pull_item(random.randint(0, len(self.ids)-1))
            else:
                # Bug fix: np.float was removed in NumPy >= 1.24; plain `float`
                # keeps the same float64 dtype.
                img, _, _, _ = self.transform(img, np.zeros((1, height, width), dtype=float),
                                              np.array([[0, 0, 1, 1]]),
                                              {'num_crowds': 0, 'labels': np.array([0])})
                masks = None
                target = None

        return torch.from_numpy(img).permute(2, 0, 1), target, masks, height, width, num_crowds

    def pull_image(self, index):
        '''Returns the original image object at index in PIL form

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to show
        Return:
            cv2 img
        '''
        img_id = self.ids[index]
        path = self.coco.loadImgs(img_id)[0]['file_name']
        return cv2.imread(osp.join(self.root, path), cv2.IMREAD_COLOR)

    def pull_anno(self, index):
        '''Returns the original annotation of image at index

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to get annotation of
        Return:
            list: [img_id, [(label, bbox coords),...]]
                eg: ('001718', [('dog', (96, 13, 438, 332))])
        '''
        img_id = self.ids[index]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        return self.coco.loadAnns(ann_ids)

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str
def enforce_size(img, targets, masks, num_crowds, new_w, new_h):
    """ Resizes img/masks to fit within (new_h, new_w) without distorting the
    aspect ratio, rescales the boxes to match, and zero-pads everything out
    to exactly (new_h, new_w). Returns the inputs untouched if the size
    already matches. """
    with torch.no_grad():
        _, h, w = img.size()

        if h == new_h and w == new_w:
            return img, targets, masks, num_crowds

        # Fit the image inside the target size, preserving aspect ratio.
        scaled_w = new_w
        scaled_h = h * new_w / w
        if scaled_h > new_h:
            scaled_w *= new_h / scaled_h
            scaled_h = new_h

        scaled_w, scaled_h = int(scaled_w), int(scaled_h)

        # Resize the image...
        img = F.interpolate(img.unsqueeze(0), (scaled_h, scaled_w),
                            mode='bilinear', align_corners=False).squeeze(0)

        # ...and the masks, treating each object as a color channel.
        masks = F.interpolate(masks.unsqueeze(0), (scaled_h, scaled_w),
                              mode='bilinear', align_corners=False).squeeze(0)

        # Rescale boxes (this puts them in the top-left corner in the case of padding).
        targets[:, [0, 2]] *= (scaled_w / new_w)
        targets[:, [1, 3]] *= (scaled_h / new_h)

        # Finally, zero-pad out to the requested size.
        pad = (0, new_w - scaled_w, 0, new_h - scaled_h)
        img   = F.pad(img,   pad, mode='constant', value=0)
        masks = F.pad(masks, pad, mode='constant', value=0)

        return img, targets, masks, num_crowds
def detection_collate(batch):
    """Custom collate fn for dealing with batches of images that have a different
    number of associated object annotations (bounding boxes).

    Arguments:
        batch: (tuple) A tuple of tensor images and (lists of annotations, masks)

    Return:
        A tuple containing:
            1) (list) images, one tensor per sample
            2) (list, list, list) per-sample annotation tensors, mask tensors,
               and crowd counts.
    """
    imgs       = [sample[0] for sample in batch]
    targets    = [torch.FloatTensor(sample[1][0]) for sample in batch]
    masks      = [torch.FloatTensor(sample[1][1]) for sample in batch]
    num_crowds = [sample[1][2] for sample in batch]

    return imgs, (targets, masks, num_crowds)
================================================
FILE: data/config.py
================================================
from backbone import ResNetBackbone, VGGBackbone, ResNetBackboneGN, DarkNetBackbone
from math import sqrt
import torch
# for making bounding boxes pretty
# (a fixed palette of color triples, cycled through when drawing detections)
COLORS = ((244,  67,  54),
          (233,  30,  99),
          (156,  39, 176),
          (103,  58, 183),
          ( 63,  81, 181),
          ( 33, 150, 243),
          (  3, 169, 244),
          (  0, 188, 212),
          (  0, 150, 136),
          ( 76, 175,  80),
          (139, 195,  74),
          (205, 220,  57),
          (255, 235,  59),
          (255, 193,   7),
          (255, 152,   0),
          (255,  87,  34),
          (121,  85,  72),
          (158, 158, 158),
          ( 96, 125, 139))

# These are in BGR and are for ImageNet
MEANS = (103.94, 116.78, 123.68)
STD   = (57.38, 57.12, 58.40)

# The 80 COCO class names, in label order (label i corresponds to COCO_CLASSES[i]).
COCO_CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
                'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
                'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
                'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
                'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
                'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
                'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
                'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
                'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
                'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
                'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
                'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
                'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
                'scissors', 'teddy bear', 'hair drier', 'toothbrush')

# COCO category ids are not contiguous (they span 1-90 with gaps); this maps
# each category_id to a contiguous 1-indexed label in [1, 80].
COCO_LABEL_MAP = { 1:  1,  2:  2,  3:  3,  4:  4,  5:  5,  6:  6,  7:  7,  8:  8,
                   9:  9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16,
                  18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24,
                  27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32,
                  37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40,
                  46: 41, 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48,
                  54: 49, 55: 50, 56: 51, 57: 52, 58: 53, 59: 54, 60: 55, 61: 56,
                  62: 57, 63: 58, 64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64,
                  74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, 80: 71, 81: 72,
                  82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80}
# ----------------------- CONFIG CLASS ----------------------- #
class Config(object):
    """
    Holds the configuration for anything you want it to.
    To get the currently active config, call get_cfg().

    To use, just do cfg.x instead of cfg['x'].
    I made this because doing cfg['x'] all the time is dumb.
    """

    def __init__(self, config_dict):
        # Expose every entry of the dict as an attribute on this object.
        for key, val in config_dict.items():
            self.__setattr__(key, val)

    def copy(self, new_config_dict=None):
        """
        Copies this config into a new config object, making
        the changes given by new_config_dict.
        """
        # Use a None sentinel instead of a {} default to avoid the
        # shared-mutable-default-argument pitfall (behavior is unchanged).
        ret = Config(vars(self))

        if new_config_dict is not None:
            for key, val in new_config_dict.items():
                ret.__setattr__(key, val)

        return ret

    def replace(self, new_config_dict):
        """
        Copies new_config_dict into this config object.
        Note: new_config_dict can also be a config object.
        """
        if isinstance(new_config_dict, Config):
            new_config_dict = vars(new_config_dict)

        for key, val in new_config_dict.items():
            self.__setattr__(key, val)

    def print(self):
        """ Prints every attribute of this config, one per line. """
        for k, v in vars(self).items():
            print(k, ' = ', v)
# ----------------------- DATASETS ----------------------- #
dataset_base = Config({
    'name': 'Base Dataset',

    # Training images and annotations
    'train_images': './data/coco/images/',
    'train_info':   'path_to_annotation_file',

    # Validation images and annotations.
    'valid_images': './data/coco/images/',
    'valid_info':   'path_to_annotation_file',

    # Whether or not to load GT. If this is False, eval.py quantitative evaluation won't work.
    'has_gt': True,

    # A list of names for each of you classes.
    'class_names': COCO_CLASSES,

    # COCO class ids aren't sequential, so this is a bandage fix. If your ids aren't sequential,
    # provide a map from category_id -> index in class_names + 1 (the +1 is there because it's 1-indexed).
    # If not specified, this just assumes category ids start at 1 and increase sequentially.
    'label_map': None
})

# COCO 2014-split annotations; images live in the shared ./data/coco/images/ dir.
coco2014_dataset = dataset_base.copy({
    'name': 'COCO 2014',

    'train_info': './data/coco/annotations/instances_train2014.json',
    'valid_info': './data/coco/annotations/instances_val2014.json',

    'label_map': COCO_LABEL_MAP
})

# COCO 2017-split annotations (same images, re-partitioned).
coco2017_dataset = dataset_base.copy({
    'name': 'COCO 2017',

    'train_info': './data/coco/annotations/instances_train2017.json',
    'valid_info': './data/coco/annotations/instances_val2017.json',

    'label_map': COCO_LABEL_MAP
})

# test-dev has no public ground truth, so has_gt is off (quantitative eval disabled).
coco2017_testdev_dataset = dataset_base.copy({
    'name': 'COCO 2017 Test-Dev',

    'valid_info': './data/coco/annotations/image_info_test-dev2017.json',
    'has_gt': False,

    'label_map': COCO_LABEL_MAP
})

# The 20 Pascal VOC class names.
PASCAL_CLASSES = ("aeroplane", "bicycle", "bird", "boat", "bottle",
                  "bus", "car", "cat", "chair", "cow", "diningtable",
                  "dog", "horse", "motorbike", "person", "pottedplant",
                  "sheep", "sofa", "train", "tvmonitor")

# Pascal ids are sequential, so no label_map is needed here.
pascal_sbd_dataset = dataset_base.copy({
    'name': 'Pascal SBD 2012',

    'train_images': './data/sbd/img',
    'valid_images': './data/sbd/img',

    'train_info': './data/sbd/pascal_sbd_train.json',
    'valid_info': './data/sbd/pascal_sbd_val.json',

    'class_names': PASCAL_CLASSES,
})
# ----------------------- TRANSFORMS ----------------------- #
# Per-backbone input preprocessing flags, consumed by the augmentation
# pipeline (see utils/augmentations.py — not shown here, so field semantics
# should be verified there).
resnet_transform = Config({
    'channel_order': 'RGB',
    'normalize': True,
    'subtract_means': False,
    'to_float': False,
})

vgg_transform = Config({
    # Note that though vgg is traditionally BGR,
    # the channel order of vgg_reducedfc.pth is RGB.
    'channel_order': 'RGB',
    'normalize': False,
    'subtract_means': True,
    'to_float': False,
})

darknet_transform = Config({
    'channel_order': 'RGB',
    'normalize': False,
    'subtract_means': False,
    'to_float': True,
})
# ----------------------- BACKBONES ----------------------- #
backbone_base = Config({
    'name': 'Base Backbone',
    # Weights file loaded by init_backbone (expected under the weights dir).
    'path': 'path/to/pretrained/weights',
    # The backbone class and the positional args passed to its constructor
    # (instantiated as cfg.type(*cfg.args) in construct_backbone).
    'type': object,
    'args': tuple(),
    # Input preprocessing flags for this backbone (see the transforms above).
    'transform': resnet_transform,

    # Which backbone stage outputs feed the prediction heads; add_layer is
    # called until max(selected_layers) exists.
    'selected_layers': list(),
    # Anchor scales / aspect ratios per selected layer; consumed by the
    # prediction modules (not shown in this file).
    'pred_scales': list(),
    'pred_aspect_ratios': list(),

    # Anchor-generation options (semantics defined where anchors are built).
    'use_pixel_scales': False,
    'preapply_sqrt': True,
    'use_square_anchors': False,
})

resnet101_backbone = backbone_base.copy({
    'name': 'ResNet101',
    'path': 'resnet101_reducedfc.pth',
    'type': ResNetBackbone,
    'args': ([3, 4, 23, 3],),
    'transform': resnet_transform,

    'selected_layers': list(range(2, 8)),
    'pred_scales': [[1]]*6,
    'pred_aspect_ratios': [ [[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]] ] * 6,
})

# Same as ResNet101 but with GroupNorm and Detectron-format weights.
resnet101_gn_backbone = backbone_base.copy({
    'name': 'ResNet101_GN',
    'path': 'R-101-GN.pkl',
    'type': ResNetBackboneGN,
    'args': ([3, 4, 23, 3],),
    'transform': resnet_transform,

    'selected_layers': list(range(2, 8)),
    'pred_scales': [[1]]*6,
    'pred_aspect_ratios': [ [[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]] ] * 6,
})

# ResNet101 with deformable convs in the last blocks of stages 2-4,
# applied every 3rd block (see ResNetBackbone's dcn_layers/dcn_interval args).
resnet101_dcn_inter3_backbone = resnet101_backbone.copy({
    'name': 'ResNet101_DCN_Interval3',
    'args': ([3, 4, 23, 3], [0, 4, 23, 3], 3),
})

resnet50_backbone = resnet101_backbone.copy({
    'name': 'ResNet50',
    'path': 'resnet50-19c8e357.pth',
    'type': ResNetBackbone,
    'args': ([3, 4, 6, 3],),
    'transform': resnet_transform,
})

# ResNet50 with DCNv2 in the last blocks of stages 2-4 (every block).
resnet50_dcnv2_backbone = resnet50_backbone.copy({
    'name': 'ResNet50_DCNv2',
    'args': ([3, 4, 6, 3], [0, 4, 6, 3]),
})

darknet53_backbone = backbone_base.copy({
    'name': 'DarkNet53',
    'path': 'darknet53.pth',
    'type': DarkNetBackbone,
    'args': ([1, 2, 8, 8, 4],),
    'transform': darknet_transform,

    'selected_layers': list(range(3, 9)),
    'pred_scales': [[3.5, 4.95], [3.6, 4.90], [3.3, 4.02], [2.7, 3.10], [2.1, 2.37], [1.8, 1.92]],
    'pred_aspect_ratios': [ [[1, sqrt(2), 1/sqrt(2), sqrt(3), 1/sqrt(3)][:n], [1]] for n in [3, 5, 5, 5, 3, 3] ],
})

# VGG16 layer config in the list format VGGBackbone expects
# ('M' = max pool, number = conv output channels, tuple = layer + kwargs).
vgg16_arch = [[64, 64],
              [ 'M', 128, 128],
              [ 'M', 256, 256, 256],
              [('M', {'kernel_size': 2, 'stride': 2, 'ceil_mode': True}), 512, 512, 512],
              [ 'M', 512, 512, 512],
              [('M',  {'kernel_size': 3, 'stride':  1, 'padding':  1}),
               (1024, {'kernel_size': 3, 'padding': 6, 'dilation': 6}),
               (1024, {'kernel_size': 1})]]

vgg16_backbone = backbone_base.copy({
    'name': 'VGG16',
    'path': 'vgg16_reducedfc.pth',
    'type': VGGBackbone,
    'args': (vgg16_arch, [(256, 2), (128, 2), (128, 1), (128, 1)], [3]),
    'transform': vgg_transform,

    'selected_layers': [3] + list(range(5, 10)),
    'pred_scales': [[5, 4]]*6,
    'pred_aspect_ratios': [ [[1], [1, sqrt(2), 1/sqrt(2), sqrt(3), 1/sqrt(3)][:n]] for n in [3, 5, 5, 5, 3, 3] ],
})
# ----------------------- MASK BRANCH TYPES ----------------------- #

# Enum-like Config selecting which mask branch the model uses (see cfg.mask_type).
mask_type = Config({
    # Direct produces masks directly as the output of each pred module.
    # This is denoted as fc-mask in the paper.
    # Parameters: mask_size, use_gt_bboxes
    'direct': 0,

    # Lincomb produces coefficients as the output of each pred module then uses those coefficients
    # to linearly combine features from a prototype network to create image-sized masks.
    # Parameters:
    #   - masks_to_train (int): Since we're producing (near) full image masks, it'd take too much
    #                           vram to backprop on every single mask. Thus we select only a subset.
    #   - mask_proto_src (int): The input layer to the mask prototype generation network. This is an
    #                           index in backbone.layers. Use None to use the image itself instead.
    #   - mask_proto_net (list): A list of layers in the mask proto network with the last one
    #                            being where the masks are taken from. Each conv layer is in
    #                            the form (num_features, kernel_size, **kwdargs). An empty
    #                            list means to use the source for prototype masks. If the
    #                            kernel_size is negative, this creates a deconv layer instead.
    #                            If the kernel_size is negative and the num_features is None,
    #                            this creates a simple bilinear interpolation layer instead.
    #   - mask_proto_bias (bool): Whether to include an extra coefficient that corresponds to a proto
    #                             mask of all ones.
    #   - mask_proto_prototype_activation (func): The activation to apply to each prototype mask.
    #   - mask_proto_mask_activation (func): After summing the prototype masks with the predicted
    #                                        coeffs, what activation to apply to the final mask.
    #   - mask_proto_coeff_activation (func): The activation to apply to the mask coefficients.
    #   - mask_proto_crop (bool): If True, crop the mask with the predicted bbox during training.
    #   - mask_proto_crop_expand (float): If cropping, the percent to expand the cropping bbox by
    #                                     in each direction. This is to make the model less reliant
    #                                     on perfect bbox predictions.
    #   - mask_proto_loss (str [l1|disj]): If not None, apply an l1 or disjunctive regularization
    #                                      loss directly to the prototype masks.
    #   - mask_proto_binarize_downsampled_gt (bool): Binarize GT after downsampling during training?
    #   - mask_proto_normalize_mask_loss_by_sqrt_area (bool): Whether to normalize mask loss by sqrt(sum(gt))
    #   - mask_proto_reweight_mask_loss (bool): Reweight mask loss such that background is divided by
    #                                           #background and foreground is divided by #foreground.
    #   - mask_proto_grid_file (str): The path to the grid file to use with the next option.
    #                                 This should be a numpy.dump file with shape [numgrids, h, w]
    #                                 where h and w are w.r.t. the mask_proto_src convout.
    #   - mask_proto_use_grid (bool): Whether to add extra grid features to the proto_net input.
    #   - mask_proto_coeff_gate (bool): Add an extra set of sigmoided coefficients that is multiplied
    #                                   into the predicted coefficients in order to "gate" them.
    #   - mask_proto_prototypes_as_features (bool): For each prediction module, downsample the prototypes
    #                                               to the convout size of that module and supply the prototypes as input
    #                                               in addition to the already supplied backbone features.
    #   - mask_proto_prototypes_as_features_no_grad (bool): If the above is set, don't backprop gradients to
    #                                                       to the prototypes from the network head.
    #   - mask_proto_remove_empty_masks (bool): Remove masks that are downsampled to 0 during loss calculations.
    #   - mask_proto_reweight_coeff (float): The coefficient to multiply the foreground pixels with if reweighting.
    #   - mask_proto_coeff_diversity_loss (bool): Apply coefficient diversity loss on the coefficients so that the same
    #                                             instance has similar coefficients.
    #   - mask_proto_coeff_diversity_alpha (float): The weight to use for the coefficient diversity loss.
    #   - mask_proto_normalize_emulate_roi_pooling (bool): Normalize the mask loss to emulate roi pooling's effect on loss.
    #   - mask_proto_double_loss (bool): Whether to use the old loss in addition to any special new losses.
    #   - mask_proto_double_loss_alpha (float): The alpha to weight the above loss.
    #   - mask_proto_split_prototypes_by_head (bool): If true, this will give each prediction head its own prototypes.
    #   - mask_proto_crop_with_pred_box (bool): Whether to crop with the predicted box or the gt box.
    'lincomb': 1,
})
# ----------------------- ACTIVATION FUNCTIONS ----------------------- #

# Lookup table of activation callables referenced by the mask_proto_* settings.
activation_func = Config({
    'tanh':    torch.tanh,
    'sigmoid': torch.sigmoid,
    'softmax': lambda x: torch.nn.functional.softmax(x, dim=-1),
    'relu':    lambda x: torch.nn.functional.relu(x, inplace=True),
    'none':    lambda x: x,
})

# ----------------------- FPN DEFAULTS ----------------------- #

# Default Feature Pyramid Network settings; model configs override via .copy().
fpn_base = Config({
    # The number of features to have in each FPN layer
    'num_features': 256,

    # The upsampling mode used
    'interpolation_mode': 'bilinear',

    # The number of extra layers to be produced by downsampling starting at P5
    'num_downsample': 1,

    # Whether to down sample with a 3x3 stride 2 conv layer instead of just a stride 2 selection
    'use_conv_downsample': False,

    # Whether to pad the pred layers with 1 on each side (I forgot to add this at the start)
    # This is just here for backwards compatibility
    'pad': True,

    # Whether to add relu to the downsampled layers.
    'relu_downsample_layers': False,

    # Whether to add relu to the regular layers
    'relu_pred_layers': True,
})
# ----------------------- CONFIG DEFAULTS ----------------------- #

# Base configuration that every model config below derives from via .copy().
coco_base_config = Config({
    'dataset': coco2014_dataset,
    'num_classes': 81, # This should include the background class
    'max_iter': 400000,

    # The maximum number of detections for evaluation
    'max_num_detections': 100,

    # dw' = momentum * dw - lr * (grad + decay * w)
    'lr': 1e-3,
    'momentum': 0.9,
    'decay': 5e-4,

    # For each lr step, what to multiply the lr with
    'gamma': 0.1,
    'lr_steps': (280000, 360000, 400000),

    # Initial learning rate to linearly warmup from (if until > 0)
    'lr_warmup_init': 1e-4,

    # If > 0 then increase the lr linearly from warmup_init to lr each iter for until iters
    'lr_warmup_until': 500,

    # The terms to scale the respective loss by
    'conf_alpha': 1,
    'bbox_alpha': 1.5,
    'mask_alpha': 0.4 / 256 * 140 * 140, # Some funky equation. Don't worry about it.

    # Eval.py sets this if you just want to run YOLACT as a detector
    'eval_mask_branch': True,

    # Top_k examples to consider for NMS
    'nms_top_k': 200,
    # Examples with confidence less than this are not considered by NMS
    'nms_conf_thresh': 0.05,
    # Boxes with IoU overlap greater than this threshold will be culled during NMS
    'nms_thresh': 0.5,

    # See mask_type for details.
    'mask_type': mask_type.direct,
    'mask_size': 16,
    'masks_to_train': 100,
    'mask_proto_src': None,
    'mask_proto_net': [(256, 3, {}), (256, 3, {})],
    'mask_proto_bias': False,
    'mask_proto_prototype_activation': activation_func.relu,
    'mask_proto_mask_activation': activation_func.sigmoid,
    'mask_proto_coeff_activation': activation_func.tanh,
    'mask_proto_crop': True,
    'mask_proto_crop_expand': 0,
    'mask_proto_loss': None,
    'mask_proto_binarize_downsampled_gt': True,
    'mask_proto_normalize_mask_loss_by_sqrt_area': False,
    'mask_proto_reweight_mask_loss': False,
    'mask_proto_grid_file': 'data/grid.npy',
    'mask_proto_use_grid': False,
    'mask_proto_coeff_gate': False,
    'mask_proto_prototypes_as_features': False,
    'mask_proto_prototypes_as_features_no_grad': False,
    'mask_proto_remove_empty_masks': False,
    'mask_proto_reweight_coeff': 1,
    'mask_proto_coeff_diversity_loss': False,
    'mask_proto_coeff_diversity_alpha': 1,
    'mask_proto_normalize_emulate_roi_pooling': False,
    'mask_proto_double_loss': False,
    'mask_proto_double_loss_alpha': 1,
    'mask_proto_split_prototypes_by_head': False,
    'mask_proto_crop_with_pred_box': False,

    # SSD data augmentation parameters
    # Randomize hue, vibrance, etc.
    'augment_photometric_distort': True,
    # Have a chance to scale down the image and pad (to emulate smaller detections)
    'augment_expand': True,
    # Potentialy sample a random crop from the image and put it in a random place
    'augment_random_sample_crop': True,
    # Mirror the image with a probability of 1/2
    'augment_random_mirror': True,
    # Flip the image vertically with a probability of 1/2
    'augment_random_flip': False,
    # With uniform probability, rotate the image [0,90,180,270] degrees
    'augment_random_rot90': False,

    # Discard detections with width and height smaller than this (in absolute width and height)
    'discard_box_width': 4 / 550,
    'discard_box_height': 4 / 550,

    # If using batchnorm anywhere in the backbone, freeze the batchnorm layer during training.
    # Note: any additional batch norm layers after the backbone will not be frozen.
    'freeze_bn': False,

    # Set this to a config object if you want an FPN (inherit from fpn_base). See fpn_base for details.
    'fpn': None,

    # Use the same weights for each network head
    'share_prediction_module': False,

    # For hard negative mining, instead of using the negatives that are leastl confidently background,
    # use negatives that are most confidently not background.
    'ohem_use_most_confident': False,

    # Use focal loss as described in https://arxiv.org/pdf/1708.02002.pdf instead of OHEM
    'use_focal_loss': False,
    'focal_loss_alpha': 0.25,
    'focal_loss_gamma': 2,

    # The initial bias toward forground objects, as specified in the focal loss paper
    'focal_loss_init_pi': 0.01,

    # Keeps track of the average number of examples for each class, and weights the loss for that class accordingly.
    'use_class_balanced_conf': False,

    # Whether to use sigmoid focal loss instead of softmax, all else being the same.
    'use_sigmoid_focal_loss': False,

    # Use class[0] to be the objectness score and class[1:] to be the softmax predicted class.
    # Note: at the moment this is only implemented if use_focal_loss is on.
    'use_objectness_score': False,

    # Adds a global pool + fc layer to the smallest selected layer that predicts the existence of each of the 80 classes.
    # This branch is only evaluated during training time and is just there for multitask learning.
    'use_class_existence_loss': False,
    'class_existence_alpha': 1,

    # Adds a 1x1 convolution directly to the biggest selected layer that predicts a semantic segmentations for each of the 80 classes.
    # This branch is only evaluated during training time and is just there for multitask learning.
    'use_semantic_segmentation_loss': False,
    'semantic_segmentation_alpha': 1,

    # Adds another branch to the netwok to predict Mask IoU.
    'use_mask_scoring': False,
    'mask_scoring_alpha': 1,

    # Match gt boxes using the Box2Pix change metric instead of the standard IoU metric.
    # Note that the threshold you set for iou_threshold should be negative with this setting on.
    'use_change_matching': False,

    # Uses the same network format as mask_proto_net, except this time it's for adding extra head layers before the final
    # prediction in prediction modules. If this is none, no extra layers will be added.
    'extra_head_net': None,

    # What params should the final head layers have (the ones that predict box, confidence, and mask coeffs)
    'head_layer_params': {'kernel_size': 3, 'padding': 1},

    # Add extra layers between the backbone and the network heads
    # The order is (bbox, conf, mask)
    'extra_layers': (0, 0, 0),

    # During training, to match detections with gt, first compute the maximum gt IoU for each prior.
    # Then, any of those priors whose maximum overlap is over the positive threshold, mark as positive.
    # For any priors whose maximum is less than the negative iou threshold, mark them as negative.
    # The rest are neutral and not used in calculating the loss.
    'positive_iou_threshold': 0.5,
    'negative_iou_threshold': 0.5,

    # When using ohem, the ratio between positives and negatives (3 means 3 negatives to 1 positive)
    'ohem_negpos_ratio': 3,

    # If less than 1, anchors treated as a negative that have a crowd iou over this threshold with
    # the crowd boxes will be treated as a neutral.
    'crowd_iou_threshold': 1,

    # This is filled in at runtime by Yolact's __init__, so don't touch it
    'mask_dim': None,

    # Input image size.
    'max_size': 300,

    # Whether or not to do post processing on the cpu at test time
    'force_cpu_nms': True,

    # Whether to use mask coefficient cosine similarity nms instead of bbox iou nms
    'use_coeff_nms': False,

    # Whether or not to have a separate branch whose sole purpose is to act as the coefficients for coeff_diversity_loss
    # Remember to turn on coeff_diversity_loss, or these extra coefficients won't do anything!
    # To see their effect, also remember to turn on use_coeff_nms.
    'use_instance_coeff': False,
    'num_instance_coeffs': 64,

    # Whether or not to tie the mask loss / box loss to 0
    'train_masks': True,
    'train_boxes': True,
    # If enabled, the gt masks will be cropped using the gt bboxes instead of the predicted ones.
    # This speeds up training time considerably but results in much worse mAP at test time.
    'use_gt_bboxes': False,

    # Whether or not to preserve aspect ratio when resizing the image.
    # If True, this will resize all images to be max_size^2 pixels in area while keeping aspect ratio.
    # If False, all images are resized to max_size x max_size
    'preserve_aspect_ratio': False,

    # Whether or not to use the prediction module (c) from DSSD
    'use_prediction_module': False,

    # Whether or not to use the predicted coordinate scheme from Yolo v2
    'use_yolo_regressors': False,

    # For training, bboxes are considered "positive" if their anchors have a 0.5 IoU overlap
    # or greater with a ground truth box. If this is true, instead of using the anchor boxes
    # for this IoU computation, the matching function will use the predicted bbox coordinates.
    # Don't turn this on if you're not using yolo regressors!
    'use_prediction_matching': False,

    # A list of settings to apply after the specified iteration. Each element of the list should look like
    # (iteration, config_dict) where config_dict is a dictionary you'd pass into a config object's init.
    'delayed_settings': [],

    # Use command-line arguments to set this.
    'no_jit': False,

    'backbone': None,
    'name': 'base_config',

    # Fast Mask Re-scoring Network
    # Inspired by Mask Scoring R-CNN (https://arxiv.org/abs/1903.00241)
    # Do not crop out the mask with bbox but slide a convnet on the image-size mask,
    # then use global pooling to get the final mask score
    'use_maskiou': False,

    # Architecture for the mask iou network. A (num_classes-1, 1, {}) layer is appended to the end.
    'maskiou_net': [],

    # Discard predicted masks whose area is less than this
    'discard_mask_area': -1,

    'maskiou_alpha': 1.0,
    'rescore_mask': False,
    'rescore_bbox': False,

    'maskious_to_train': -1,
})
# ----------------------- YOLACT v1.0 CONFIGS ----------------------- #

# The reference YOLACT model: ResNet-101 + FPN at 550x550 on COCO 2017.
yolact_base_config = coco_base_config.copy({
    'name': 'yolact_base',

    # Dataset stuff
    'dataset': coco2017_dataset,
    'num_classes': len(coco2017_dataset.class_names) + 1,

    # Image Size
    'max_size': 550,

    # Training params
    'lr_steps': (280000, 600000, 700000, 750000),
    'max_iter': 800000,

    # Backbone Settings
    'backbone': resnet101_backbone.copy({
        'selected_layers': list(range(1, 4)),
        'use_pixel_scales': True,
        'preapply_sqrt': False,
        'use_square_anchors': True, # This is for backward compatibility with a bug

        'pred_aspect_ratios': [ [[1, 1/2, 2]] ]*5,
        'pred_scales': [[24], [48], [96], [192], [384]],
    }),

    # FPN Settings
    'fpn': fpn_base.copy({
        'use_conv_downsample': True,
        'num_downsample': 2,
    }),

    # Mask Settings
    'mask_type': mask_type.lincomb,
    'mask_alpha': 6.125,
    'mask_proto_src': 0,
    'mask_proto_net': [(256, 3, {'padding': 1})] * 3 + [(None, -2, {}), (256, 3, {'padding': 1})] + [(32, 1, {})],
    'mask_proto_normalize_emulate_roi_pooling': True,

    # Other stuff
    'share_prediction_module': True,
    'extra_head_net': [(256, 3, {'padding': 1})],

    'positive_iou_threshold': 0.5,
    'negative_iou_threshold': 0.4,

    'crowd_iou_threshold': 0.7,

    'use_semantic_segmentation_loss': True,
})

# Same model at 400x400 input; anchor scales are rescaled proportionally.
yolact_im400_config = yolact_base_config.copy({
    'name': 'yolact_im400',
    'max_size': 400,
    'backbone': yolact_base_config.backbone.copy({
        'pred_scales': [[int(x[0] / yolact_base_config.max_size * 400)] for x in yolact_base_config.backbone.pred_scales],
    }),
})

# Same model at 700x700 input; more masks are trained per image.
yolact_im700_config = yolact_base_config.copy({
    'name': 'yolact_im700',
    'masks_to_train': 300,
    'max_size': 700,
    'backbone': yolact_base_config.backbone.copy({
        'pred_scales': [[int(x[0] / yolact_base_config.max_size * 700)] for x in yolact_base_config.backbone.pred_scales],
    }),
})

# YOLACT with a DarkNet-53 backbone instead of ResNet-101.
yolact_darknet53_config = yolact_base_config.copy({
    'name': 'yolact_darknet53',

    'backbone': darknet53_backbone.copy({
        'selected_layers': list(range(2, 5)),

        'pred_scales': yolact_base_config.backbone.pred_scales,
        'pred_aspect_ratios': yolact_base_config.backbone.pred_aspect_ratios,
        'use_pixel_scales': True,
        'preapply_sqrt': False,
        'use_square_anchors': True, # This is for backward compatibility with a bug
    }),
})

# YOLACT with a ResNet-50 backbone instead of ResNet-101.
yolact_resnet50_config = yolact_base_config.copy({
    'name': 'yolact_resnet50',

    'backbone': resnet50_backbone.copy({
        'selected_layers': list(range(1, 4)),

        'pred_scales': yolact_base_config.backbone.pred_scales,
        'pred_aspect_ratios': yolact_base_config.backbone.pred_aspect_ratios,
        'use_pixel_scales': True,
        'preapply_sqrt': False,
        'use_square_anchors': True, # This is for backward compatibility with a bug
    }),
})

# ResNet-50 YOLACT trained on Pascal SBD with a shorter schedule.
yolact_resnet50_pascal_config = yolact_resnet50_config.copy({
    'name': None, # Will default to yolact_resnet50_pascal

    # Dataset stuff
    'dataset': pascal_sbd_dataset,
    'num_classes': len(pascal_sbd_dataset.class_names) + 1,

    'max_iter': 120000,
    'lr_steps': (60000, 100000),

    'backbone': yolact_resnet50_config.backbone.copy({
        'pred_scales': [[32], [64], [128], [256], [512]],
        'use_square_anchors': False,
    })
})

# ----------------------- YOLACT++ CONFIGS ----------------------- #

# YOLACT++: deformable convs in the backbone, multi-scale anchors,
# and a fast mask re-scoring (mask IoU) branch.
yolact_plus_base_config = yolact_base_config.copy({
    'name': 'yolact_plus_base',

    'backbone': resnet101_dcn_inter3_backbone.copy({
        'selected_layers': list(range(1, 4)),

        'pred_aspect_ratios': [ [[1, 1/2, 2]] ]*5,
        # Three scales per level: base * 2^(j/3) for j in 0..2.
        'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]],
        'use_pixel_scales': True,
        'preapply_sqrt': False,
        'use_square_anchors': False,
    }),

    'use_maskiou': True,
    'maskiou_net': [(8, 3, {'stride': 2}), (16, 3, {'stride': 2}), (32, 3, {'stride': 2}), (64, 3, {'stride': 2}), (128, 3, {'stride': 2})],
    'maskiou_alpha': 25,
    'rescore_bbox': False,
    'rescore_mask': True,

    'discard_mask_area': 5*5,
})

# YOLACT++ with a ResNet-50 DCNv2 backbone.
yolact_plus_resnet50_config = yolact_plus_base_config.copy({
    'name': 'yolact_plus_resnet50',

    'backbone': resnet50_dcnv2_backbone.copy({
        'selected_layers': list(range(1, 4)),

        'pred_aspect_ratios': [ [[1, 1/2, 2]] ]*5,
        'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]],
        'use_pixel_scales': True,
        'preapply_sqrt': False,
        'use_square_anchors': False,
    }),
})

# Default config
cfg = yolact_base_config.copy()
def set_cfg(config_name: str):
    """ Sets the active config. Works even if cfg is already imported! """
    global cfg

    # Evaluated rather than looked up in globals() on purpose: callers may pass
    # full expressions such as "ssd300_config.copy({'max_size': 400})" for
    # extreme fine-tuning, not just plain config names.
    chosen = eval(config_name)
    cfg.replace(chosen)

    # If the chosen config left its name unset, derive one from the argument
    # by taking everything before the '_config' suffix.
    if cfg.name is None:
        cfg.name = config_name.partition('_config')[0]
def set_dataset(dataset_name: str):
    """ Sets the dataset of the current config. """
    # Resolve the dataset object named by the string (e.g. 'coco2017_dataset')
    # and attach it to the active config.
    dataset_obj = eval(dataset_name)
    cfg.dataset = dataset_obj
================================================
FILE: data/scripts/COCO.sh
================================================
#!/bin/bash
# Downloads the MS COCO 2017 train/val images plus the 2014 and 2017
# trainval annotations and extracts them into ./data/coco (or into the
# directory passed as $1).

start=`date +%s`

# handle optional download dir
if [ -z "$1" ]
then
  # navigate to ./data
  echo "navigating to ./data/ ..."
  mkdir -p ./data
  cd ./data/ || exit 1
  mkdir -p ./coco
  cd ./coco || exit 1
  mkdir -p ./images
  mkdir -p ./annotations
else
  # check if specified dir is valid (quote $1 so paths with spaces work)
  if [ ! -d "$1" ]; then
    echo "$1" " is not a valid directory"
    # Exit non-zero so callers can detect the failure (was: exit 0).
    exit 1
  fi
  echo "navigating to " "$1" " ..."
  cd "$1" || exit 1
fi

if [ ! -d images ]
then
  mkdir -p ./images
fi

# Download the image data.
cd ./images || exit 1
echo "Downloading MSCOCO train images ..."
curl -LO http://images.cocodataset.org/zips/train2017.zip
echo "Downloading MSCOCO val images ..."
curl -LO http://images.cocodataset.org/zips/val2017.zip

cd ../ || exit 1
if [ ! -d annotations ]
then
  mkdir -p ./annotations
fi

# Download the annotation data.
cd ./annotations || exit 1
echo "Downloading MSCOCO train/val annotations ..."
curl -LO http://images.cocodataset.org/annotations/annotations_trainval2014.zip
curl -LO http://images.cocodataset.org/annotations/annotations_trainval2017.zip
echo "Finished downloading. Now extracting ..."

# Unzip data
echo "Extracting train images ..."
unzip -qqjd ../images ../images/train2017.zip
echo "Extracting val images ..."
unzip -qqjd ../images ../images/val2017.zip
echo "Extracting annotations ..."
unzip -qqd .. ./annotations_trainval2014.zip
unzip -qqd .. ./annotations_trainval2017.zip

echo "Removing zip files ..."
rm ../images/train2017.zip
rm ../images/val2017.zip
rm ./annotations_trainval2014.zip
rm ./annotations_trainval2017.zip

end=`date +%s`
runtime=$((end-start))

echo "Completed in " $runtime " seconds"
================================================
FILE: data/scripts/COCO_test.sh
================================================
#!/bin/bash
# Downloads the MS COCO 2017 test images and test image-info annotations
# and extracts them into ./data/coco (or into the directory passed as $1).

start=`date +%s`

# handle optional download dir
if [ -z "$1" ]
then
  # navigate to ./data
  echo "navigating to ./data/ ..."
  mkdir -p ./data
  cd ./data/ || exit 1
  mkdir -p ./coco
  cd ./coco || exit 1
  mkdir -p ./images
  mkdir -p ./annotations
else
  # check if specified dir is valid (quote $1 so paths with spaces work)
  if [ ! -d "$1" ]; then
    echo "$1" " is not a valid directory"
    # Exit non-zero so callers can detect the failure (was: exit 0).
    exit 1
  fi
  echo "navigating to " "$1" " ..."
  cd "$1" || exit 1
fi

if [ ! -d images ]
then
  mkdir -p ./images
fi

# Download the image data.
cd ./images || exit 1
echo "Downloading MSCOCO test images ..."
curl -LO http://images.cocodataset.org/zips/test2017.zip

cd ../ || exit 1
if [ ! -d annotations ]
then
  mkdir -p ./annotations
fi

# Download the annotation data.
cd ./annotations || exit 1
echo "Downloading MSCOCO test info ..."
curl -LO http://images.cocodataset.org/annotations/image_info_test2017.zip
echo "Finished downloading. Now extracting ..."

# Unzip data
# (message fixed: this extracts the test set, not the train set)
echo "Extracting test images ..."
unzip -qqjd ../images ../images/test2017.zip
echo "Extracting info ..."
unzip -qqd .. ./image_info_test2017.zip

echo "Removing zip files ..."
rm ../images/test2017.zip
rm ./image_info_test2017.zip

end=`date +%s`
runtime=$((end-start))

echo "Completed in " $runtime " seconds"
================================================
FILE: data/scripts/mix_sets.py
================================================
import json
import os
import sys
from collections import defaultdict
usage_text = """
This script creates a coco annotation file by mixing one or more existing annotation files.

Usage: python data/scripts/mix_sets.py output_name [set1 range1 [set2 range2 [...]]]

To use, specify the output annotation name and any number of set + range pairs, where the sets
are in the form instances_<set_name>.json and ranges are python-evalable ranges. The resulting
json will be spit out as instances_<output_name>.json in the same folder as the input sets.

For instance,
    python data/scripts/mix_sets.py trainval35k train2014 : val2014 :-5000

This will create an instances_trainval35k.json file with all images and corresponding annotations
from train2014 and the first 35000 images from val2014.

You can also specify only one set:
    python data/scripts/mix_sets.py minival5k val2014 -5000:

This will take the last 5k images from val2014 and put it in instances_minival5k.json.
"""

# Template for COCO annotation file paths; %s is the set / output name.
annotations_path = 'data/coco/annotations/instances_%s.json'

# JSON fields concatenated across every input set.
fields_to_combine = ('images', 'annotations')
# JSON fields copied verbatim from the first input set only.
fields_to_steal = ('info', 'categories', 'licenses')
if __name__ == '__main__':
    # argv layout: [script, output_name, set1, range1, set2, range2, ...]
    # so a valid call has at least 4 entries and an even total length.
    if len(sys.argv) < 4 or len(sys.argv) % 2 != 0:
        print(usage_text)
        sys.exit()

    out_name = sys.argv[1]
    sets = sys.argv[2:]
    sets = [(sets[2*i], sets[2*i+1]) for i in range(len(sets)//2)]

    out = {x: [] for x in fields_to_combine}

    for idx, (set_name, range_str) in enumerate(sets):
        print('Loading set %s...' % set_name)
        with open(annotations_path % set_name, 'r') as f:
            set_json = json.load(f)

        # "Steal" some fields that don't need to be combined from the first set
        if idx == 0:
            for field in fields_to_steal:
                out[field] = set_json[field]

        print('Building image index...')
        image_idx = {x['id']: x for x in set_json['images']}

        print('Collecting annotations...')
        # Group annotations by the image they belong to.
        anns_idx = defaultdict(list)
        for ann in set_json['annotations']:
            anns_idx[ann['image_id']].append(ann)

        # Sort ids for a deterministic order, then apply the user's slice.
        export_ids = sorted(image_idx.keys())
        # NOTE: range_str comes from the command line; eval is intentional for
        # this trusted local tool, but never reuse this pattern on untrusted input.
        export_ids = eval('export_ids[%s]' % range_str, {}, {'export_ids': export_ids})

        print('Adding %d images...' % len(export_ids))
        for _id in export_ids:
            out['images'].append(image_idx[_id])
            out['annotations'] += anns_idx[_id]

        print('Done.\n')

    print('Saving result...')
    with open(annotations_path % (out_name), 'w') as out_file:
        json.dump(out, out_file)
================================================
FILE: environment.yml
================================================
# Installs dependencies for YOLACT managed by Anaconda.
# Advantage is you get working CUDA+cuDNN+pytorch+torchvision versions.
#
# TODO: you must additionally install nVidia drivers, eg. on Ubuntu linux
# `apt install nvidia-driver-440` (change the 440 for whatever version you need/have).
#
name: yolact-env
#prefix: /your/custom/path/envs/yolact-env
channels:
- conda-forge
- pytorch
- defaults
dependencies:
- python==3.7
- pip
- cython
- pytorch::torchvision
- pytorch::pytorch >=1.0.1
- cudatoolkit
- cudnn
- pytorch::cuda100
- matplotlib
- git # to download COCO dataset
- curl # to download COCO dataset
- unzip # to download COCO dataset
- conda-forge::bash # to download COCO dataset
- pip:
- opencv-python
- pillow <7.0 # bug PILLOW_VERSION in torchvision, must be < 7.0 until torchvision is upgraded
- pycocotools
- PyQt5 # needed on KDE/Qt envs for matplotlib
================================================
FILE: eval.py
================================================
from data import COCODetection, get_label_map, MEANS, COLORS
from yolact import Yolact
from utils.augmentations import BaseTransform, FastBaseTransform, Resize
from utils.functions import MovingAverage, ProgressBar
from layers.box_utils import jaccard, center_size, mask_iou
from utils import timer
from utils.functions import SavePath
from layers.output_utils import postprocess, undo_image_transformation
import pycocotools
from data import cfg, set_cfg, set_dataset
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import argparse
import time
import random
import cProfile
import pickle
import json
import os
from collections import defaultdict
from pathlib import Path
from collections import OrderedDict
from PIL import Image
import matplotlib.pyplot as plt
import cv2
def str2bool(v):
    """Convert a truthy/falsy command-line string into a bool.

    Accepts (case-insensitively) yes/true/t/y/1 and no/false/f/n/0; anything
    else raises argparse.ArgumentTypeError so argparse reports a clean error.
    """
    normalized = v.lower()
    if normalized in ('yes', 'true', 't', 'y', '1'):
        return True
    if normalized in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
def parse_args(argv=None):
    """Parse evaluation command-line flags into the module-level `args`.

    Args:
        argv: Optional list of argument strings; None means sys.argv[1:].

    Side effects:
        Sets the global `args` namespace, forces output_coco_json on when
        output_web_json is requested, and seeds `random` if --seed is given.
    """
    parser = argparse.ArgumentParser(
        description='YOLACT COCO Evaluation')
    parser.add_argument('--trained_model',
                        default='weights/ssd300_mAP_77.43_v2.pth', type=str,
                        help='Trained state_dict file path to open. If "interrupt", this will open the interrupt file.')
    parser.add_argument('--top_k', default=5, type=int,
                        help='Further restrict the number of predictions to parse')
    parser.add_argument('--cuda', default=True, type=str2bool,
                        help='Use cuda to evaluate model')
    parser.add_argument('--fast_nms', default=False, type=str2bool,
                        help='Whether to use a faster, but not entirely correct version of NMS.')
    parser.add_argument('--cluster_nms', default=True, type=str2bool,
                        help='Whether to use a fast and correct version of NMS.')
    parser.add_argument('--cluster_diounms', default=True, type=str2bool,
                        help='Whether to use a fast and correct version of DIoU-NMS.')
    parser.add_argument('--spm', default=True, type=str2bool,
                        help='Whether to use a score penalty mechanism for cluster NMS.')
    parser.add_argument('--spm_dist', default=True, type=str2bool,
                        help='Whether to use a score penalty mechanism + distance for cluster NMS.')
    parser.add_argument('--spm_dist_weighted', default=True, type=str2bool,
                        help='Whether to use a score penalty mechanism + distance + weighted coordinates for cluster NMS.')
    parser.add_argument('--cross_class_nms', default=False, type=str2bool,
                        help='Whether compute NMS cross-class or per-class. It supports above NMS strategies.')
    parser.add_argument('--display_masks', default=True, type=str2bool,
                        help='Whether or not to display masks over bounding boxes')
    parser.add_argument('--display_bboxes', default=True, type=str2bool,
                        help='Whether or not to display bboxes around masks')
    parser.add_argument('--display_text', default=True, type=str2bool,
                        help='Whether or not to display text (class [score])')
    parser.add_argument('--display_scores', default=True, type=str2bool,
                        help='Whether or not to display scores in addition to classes')
    parser.add_argument('--display', dest='display', action='store_true',
                        help='Display qualitative results instead of quantitative ones.')
    parser.add_argument('--shuffle', dest='shuffle', action='store_true',
                        help='Shuffles the images when displaying them. Doesn\'t have much of an effect when display is off though.')
    parser.add_argument('--ap_data_file', default='results/ap_data.pkl', type=str,
                        help='In quantitative mode, the file to save detections before calculating mAP.')
    parser.add_argument('--resume', dest='resume', action='store_true',
                        help='If display not set, this resumes mAP calculations from the ap_data_file.')
    parser.add_argument('--max_images', default=-1, type=int,
                        help='The maximum number of images from the dataset to consider. Use -1 for all.')
    parser.add_argument('--output_coco_json', dest='output_coco_json', action='store_true',
                        help='If display is not set, instead of processing IoU values, this just dumps detections into the coco json file.')
    parser.add_argument('--bbox_det_file', default='results/bbox_detections.json', type=str,
                        help='The output file for coco bbox results if --coco_results is set.')
    parser.add_argument('--mask_det_file', default='results/mask_detections.json', type=str,
                        help='The output file for coco mask results if --coco_results is set.')
    parser.add_argument('--config', default=None,
                        help='The config object to use.')
    parser.add_argument('--output_web_json', dest='output_web_json', action='store_true',
                        help='If display is not set, instead of processing IoU values, this dumps detections for usage with the detections viewer web thingy.')
    parser.add_argument('--web_det_path', default='web/dets/', type=str,
                        help='If output_web_json is set, this is the path to dump detections into.')
    parser.add_argument('--no_bar', dest='no_bar', action='store_true',
                        help='Do not output the status bar. This is useful for when piping to a file.')
    parser.add_argument('--display_lincomb', default=False, type=str2bool,
                        help='If the config uses lincomb masks, output a visualization of how those masks are created.')
    parser.add_argument('--benchmark', default=False, dest='benchmark', action='store_true',
                        help='Equivalent to running display mode but without displaying an image.')
    parser.add_argument('--no_sort', default=False, dest='no_sort', action='store_true',
                        help='Do not sort images by hashed image ID.')
    parser.add_argument('--seed', default=None, type=int,
                        help='The seed to pass into random.seed. Note: this is only really for the shuffle and does not (I think) affect cuda stuff.')
    parser.add_argument('--mask_proto_debug', default=False, dest='mask_proto_debug', action='store_true',
                        help='Outputs stuff for scripts/compute_mask.py.')
    parser.add_argument('--no_crop', default=False, dest='crop', action='store_false',
                        help='Do not crop output masks with the predicted bounding box.')
    parser.add_argument('--image', default=None, type=str,
                        help='A path to an image to use for display.')
    parser.add_argument('--images', default=None, type=str,
                        help='An input folder of images and output folder to save detected images. Should be in the format input->output.')
    parser.add_argument('--video', default=None, type=str,
                        help='A path to a video to evaluate on. Passing in a number will use that index webcam.')
    parser.add_argument('--video_multiframe', default=1, type=int,
                        help='The number of frames to evaluate in parallel to make videos play at higher fps.')
    parser.add_argument('--score_threshold', default=0, type=float,
                        help='Detections with a score under this threshold will not be considered. This currently only works in display mode.')
    parser.add_argument('--dataset', default=None, type=str,
                        help='If specified, override the dataset specified in the config with this one (example: coco2017_dataset).')
    parser.add_argument('--detect', default=False, dest='detect', action='store_true',
                        help='Don\'t evaluate the mask branch at all and only do object detection. This only works for --display and --benchmark.')
    parser.add_argument('--display_fps', default=False, dest='display_fps', action='store_true',
                        help='When displaying / saving video, draw the FPS on the frame')
    parser.add_argument('--emulate_playback', default=False, dest='emulate_playback', action='store_true',
                        help='When saving a video, emulate the framerate that you\'d get running in real-time mode.')

    # Explicit defaults for every store_true/store_false flag in one place.
    parser.set_defaults(no_bar=False, display=False, resume=False, output_coco_json=False, output_web_json=False, shuffle=False,
                        benchmark=False, no_sort=False, no_hash=False, mask_proto_debug=False, crop=True, detect=False, display_fps=False,
                        emulate_playback=False)

    global args
    args = parser.parse_args(argv)

    # Web JSON output is built from the COCO JSON output, so force it on.
    if args.output_web_json:
        args.output_coco_json = True

    if args.seed is not None:
        random.seed(args.seed)
# IoU thresholds 0.50:0.05:0.95 used for COCO-style mAP averaging.
iou_thresholds = [x / 100 for x in range(50, 100, 5)]
coco_cats = {} # Call prep_coco_cats to fill this
# presumably the inverse mapping of coco_cats, filled alongside it -- confirm in prep_coco_cats
coco_cats_inv = {}
# Per-device color cache: on_gpu key -> {color_idx: color}; see get_color in prep_display
color_cache = defaultdict(lambda: {})
def prep_display(dets_out, img, h, w, undo_transform=True, class_color=False, mask_alpha=0.45, fps_str=''):
    """
    Render the detections in dets_out (masks, boxes, class labels, scores, and
    optionally an FPS readout) on top of img and return the composited frame as
    a uint8 numpy image.

    Note: If undo_transform=False then im_h and im_w are allowed to be None.
    """
    if undo_transform:
        img_numpy = undo_image_transformation(img, w, h)
        img_gpu = torch.Tensor(img_numpy).cuda()
    else:
        # img is assumed to be a raw 0-255 GPU tensor of shape (h, w, 3) here.
        img_gpu = img / 255.0
        h, w, _ = img.shape

    with timer.env('Postprocess'):
        # Temporarily force box re-scoring on for display, restoring the flag after.
        save = cfg.rescore_bbox
        cfg.rescore_bbox = True
        t = postprocess(dets_out, w, h, visualize_lincomb = args.display_lincomb,
                        crop_masks = args.crop,
                        score_threshold = args.score_threshold)
        cfg.rescore_bbox = save

    with timer.env('Copy'):
        # t[1] is the score tensor; keep only the top_k highest-scoring detections.
        idx = t[1].argsort(0, descending=True)[:args.top_k]

        if cfg.eval_mask_branch:
            # Masks are drawn on the GPU, so don't copy
            masks = t[3][idx]
        classes, scores, boxes = [x[idx].cpu().numpy() for x in t[:3]]

    # Scores are sorted descending, so we can cut off at the first sub-threshold one.
    num_dets_to_consider = min(args.top_k, classes.shape[0])
    for j in range(num_dets_to_consider):
        if scores[j] < args.score_threshold:
            num_dets_to_consider = j
            break

    # Quick and dirty lambda for selecting the color for a particular index
    # Also keeps track of a per-gpu color cache for maximum speed
    def get_color(j, on_gpu=None):
        global color_cache
        color_idx = (classes[j] * 5 if class_color else j * 5) % len(COLORS)

        if on_gpu is not None and color_idx in color_cache[on_gpu]:
            return color_cache[on_gpu][color_idx]
        else:
            color = COLORS[color_idx]
            if not undo_transform:
                # The image might come in as RGB or BRG, depending
                color = (color[2], color[1], color[0])
            if on_gpu is not None:
                color = torch.Tensor(color).to(on_gpu).float() / 255.
                color_cache[on_gpu][color_idx] = color
            return color

    # First, draw the masks on the GPU where we can do it really fast
    # Beware: very fast but possibly unintelligible mask-drawing code ahead
    # I wish I had access to OpenGL or Vulkan but alas, I guess Pytorch tensor operations will have to suffice
    if args.display_masks and cfg.eval_mask_branch and num_dets_to_consider > 0:
        # After this, mask is of size [num_dets, h, w, 1]
        masks = masks[:num_dets_to_consider, :, :, None]

        # Prepare the RGB images for each mask given their color (size [num_dets, h, w, 1])
        colors = torch.cat([get_color(j, on_gpu=img_gpu.device.index).view(1, 1, 1, 3) for j in range(num_dets_to_consider)], dim=0)
        masks_color = masks.repeat(1, 1, 1, 3) * colors * mask_alpha

        # This is 1 everywhere except for 1-mask_alpha where the mask is
        inv_alph_masks = masks * (-mask_alpha) + 1

        # I did the math for this on pen and paper. This whole block should be equivalent to:
        #    for j in range(num_dets_to_consider):
        #        img_gpu = img_gpu * inv_alph_masks[j] + masks_color[j]
        masks_color_summand = masks_color[0]
        if num_dets_to_consider > 1:
            inv_alph_cumul = inv_alph_masks[:(num_dets_to_consider-1)].cumprod(dim=0)
            masks_color_cumul = masks_color[1:] * inv_alph_cumul
            masks_color_summand += masks_color_cumul.sum(dim=0)

        img_gpu = img_gpu * inv_alph_masks.prod(dim=0) + masks_color_summand

    if args.display_fps:
        # Draw the box for the fps on the GPU
        font_face = cv2.FONT_HERSHEY_DUPLEX
        font_scale = 0.6
        font_thickness = 1

        text_w, text_h = cv2.getTextSize(fps_str, font_face, font_scale, font_thickness)[0]

        img_gpu[0:text_h+8, 0:text_w+8] *= 0.6 # 1 - Box alpha

    # Then draw the stuff that needs to be done on the cpu
    # Note, make sure this is a uint8 tensor or opencv will not anti alias text for whatever reason
    img_numpy = (img_gpu * 255).byte().cpu().numpy()

    if args.display_fps:
        # Draw the text on the CPU
        text_pt = (4, text_h + 2)
        text_color = [255, 255, 255]

        cv2.putText(img_numpy, fps_str, text_pt, font_face, font_scale, text_color, font_thickness, cv2.LINE_AA)

    if num_dets_to_consider == 0:
        return img_numpy

    if args.display_text or args.display_bboxes:
        # Draw in reverse so the highest-scoring detection ends up on top.
        for j in reversed(range(num_dets_to_consider)):
            x1, y1, x2, y2 = boxes[j, :]
            color = get_color(j)
            score = scores[j]

            if args.display_bboxes:
                cv2.rectangle(img_numpy, (x1, y1), (x2, y2), color, 1)

            if args.display_text:
                _class = cfg.dataset.class_names[classes[j]]
                text_str = '%s: %.2f' % (_class, score) if args.display_scores else _class

                font_face = cv2.FONT_HERSHEY_DUPLEX
                font_scale = 0.6
                font_thickness = 1

                text_w, text_h = cv2.getTextSize(text_str, font_face, font_scale, font_thickness)[0]

                text_pt = (x1, y1 - 3)
                text_color = [255, 255, 255]

                # Filled label background, then the label text itself.
                cv2.rectangle(img_numpy, (x1, y1), (x1 + text_w, y1 - text_h - 4), color, -1)
                cv2.putText(img_numpy, text_str, text_pt, font_face, font_scale, text_color, font_thickness, cv2.LINE_AA)

    return img_numpy
def prep_benchmark(dets_out, h, w):
    """Run the postprocess + GPU->CPU copy pipeline purely so it can be timed."""
    with timer.env('Postprocess'):
        results = postprocess(dets_out, w, h, crop_masks=args.crop, score_threshold=args.score_threshold)

    with timer.env('Copy'):
        classes, scores, boxes, masks = (x[:args.top_k] for x in results)

        if isinstance(scores, list):
            # Fast mask re-scoring produces separate box / mask score tensors.
            box_scores = scores[0].cpu().numpy()
            mask_scores = scores[1].cpu().numpy()
        else:
            scores = scores.cpu().numpy()

        classes = classes.cpu().numpy()
        boxes = boxes.cpu().numpy()
        masks = masks.cpu().numpy()

    with timer.env('Sync'):
        # Just in case: make sure all queued GPU work is done before timing stops.
        torch.cuda.synchronize()
def prep_coco_cats():
    """ Fill the module-level coco_cats / coco_cats_inv lookup tables from the label map. """
    for cat_id, label_plus_one in get_label_map().items():
        label = label_plus_one - 1
        coco_cats[label] = cat_id
        coco_cats_inv[cat_id] = label
def get_coco_cat(transformed_cat_id):
    """ Map a transformed label (in [0,80), indexing cfg.dataset.class_names) to its COCO category id. """
    coco_id = coco_cats[transformed_cat_id]
    return coco_id
def get_transformed_cat(coco_cat_id):
    """ Map a COCO category id back to its transformed label (in [0,80), indexing cfg.dataset.class_names). """
    label = coco_cats_inv[coco_cat_id]
    return label
class Detections:
    """Accumulates per-image detections and serializes them as COCO-format JSON."""

    def __init__(self):
        self.bbox_data = []
        self.mask_data = []

    def add_bbox(self, image_id:int, category_id:int, bbox:list, score:float):
        """ Note that bbox should be a list or tuple of (x1, y1, x2, y2) """
        x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
        # COCO wants [x, y, width, height]; round to the nearest 10th to avoid
        # huge file sizes, as COCO suggests.
        coco_box = [round(float(v) * 10) / 10 for v in (x1, y1, x2 - x1, y2 - y1)]

        self.bbox_data.append({
            'image_id': int(image_id),
            'category_id': get_coco_cat(int(category_id)),
            'bbox': coco_box,
            'score': float(score)
        })

    def add_mask(self, image_id:int, category_id:int, segmentation:np.ndarray, score:float):
        """ The segmentation should be the full mask, the size of the image and with size [h, w]. """
        encoded = pycocotools.mask.encode(np.asfortranarray(segmentation.astype(np.uint8)))
        encoded['counts'] = encoded['counts'].decode('ascii') # json.dump doesn't like bytes strings

        self.mask_data.append({
            'image_id': int(image_id),
            'category_id': get_coco_cat(int(category_id)),
            'segmentation': encoded,
            'score': float(score)
        })

    def dump(self):
        """Write the accumulated box and mask detections to their JSON files."""
        for data, path in ((self.bbox_data, args.bbox_det_file),
                           (self.mask_data, args.mask_det_file)):
            with open(path, 'w') as f:
                json.dump(data, f)

    def dump_web(self):
        """ Dumps it in the format for my web app. Warning: bad code ahead! """
        config_outs = ['preserve_aspect_ratio', 'use_prediction_module',
                       'use_yolo_regressors', 'use_prediction_matching',
                       'train_masks']

        output = {
            'info' : {
                'Config': {key: getattr(cfg, key) for key in config_outs},
            }
        }

        image_ids = sorted({det['image_id'] for det in self.bbox_data})
        image_lookup = {_id: idx for idx, _id in enumerate(image_ids)}
        output['images'] = [{'image_id': image_id, 'dets': []} for image_id in image_ids]

        # These should already be sorted by score with the way prep_metrics works.
        for bbox, mask in zip(self.bbox_data, self.mask_data):
            entry = output['images'][image_lookup[bbox['image_id']]]
            entry['dets'].append({
                'score': bbox['score'],
                'bbox': bbox['bbox'],
                'category': cfg.dataset.class_names[get_transformed_cat(bbox['category_id'])],
                'mask': mask['segmentation'],
            })

        with open(os.path.join(args.web_det_path, '%s.json' % cfg.name), 'w') as f:
            json.dump(output, f)
def _mask_iou(mask1, mask2, iscrowd=False):
    """Timed wrapper around mask_iou; returns the IoU matrix on the CPU."""
    with timer.env('Mask IoU'):
        overlap = mask_iou(mask1, mask2, iscrowd)
    return overlap.cpu()
def _bbox_iou(bbox1, bbox2, iscrowd=False):
    """Timed wrapper around jaccard; returns the IoU matrix on the CPU."""
    with timer.env('BBox IoU'):
        overlap = jaccard(bbox1, bbox2, iscrowd)
    return overlap.cpu()
def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, detections:Detections=None):
    """
    Match this image's detections against its ground truth and push the results
    into ap_data (mutated in place; one APDataObject per IoU threshold and class).
    In --output_coco_json mode, the detections are appended to `detections`
    instead and no matching is done. Returns None in both modes.
    """
    if not args.output_coco_json:
        with timer.env('Prepare gt'):
            # gt boxes are stored normalized; scale them to pixel coordinates.
            gt_boxes = torch.Tensor(gt[:, :4])
            gt_boxes[:, [0, 2]] *= w
            gt_boxes[:, [1, 3]] *= h
            gt_classes = list(gt[:, 4].astype(int))
            gt_masks = torch.Tensor(gt_masks).view(-1, h*w)

            # Crowd annotations are stored at the end of the gt arrays; split them off.
            if num_crowd > 0:
                split = lambda x: (x[-num_crowd:], x[:-num_crowd])
                crowd_boxes , gt_boxes = split(gt_boxes)
                crowd_masks , gt_masks = split(gt_masks)
                crowd_classes, gt_classes = split(gt_classes)

    with timer.env('Postprocess'):
        classes, scores, boxes, masks = postprocess(dets, w, h, crop_masks=args.crop, score_threshold=args.score_threshold)

        if classes.size(0) == 0:
            return

        classes = list(classes.cpu().numpy().astype(int))
        if isinstance(scores, list):
            # Fast mask re-scoring: separate box and mask confidences.
            box_scores = list(scores[0].cpu().numpy().astype(float))
            mask_scores = list(scores[1].cpu().numpy().astype(float))
        else:
            scores = list(scores.cpu().numpy().astype(float))
            box_scores = scores
            mask_scores = scores
        masks = masks.view(-1, h*w).cuda()
        boxes = boxes.cuda()

    if args.output_coco_json:
        with timer.env('JSON Output'):
            boxes = boxes.cpu().numpy()
            masks = masks.view(-1, h, w).cpu().numpy()
            for i in range(masks.shape[0]):
                # Make sure that the bounding box actually makes sense and a mask was produced
                if (boxes[i, 3] - boxes[i, 1]) * (boxes[i, 2] - boxes[i, 0]) > 0:
                    detections.add_bbox(image_id, classes[i], boxes[i,:], box_scores[i])
                    detections.add_mask(image_id, classes[i], masks[i,:,:], mask_scores[i])
            return

    with timer.env('Eval Setup'):
        num_pred = len(classes)
        num_gt = len(gt_classes)

        # Precompute all pred-vs-gt IoUs once; the matching loop below only indexes them.
        mask_iou_cache = _mask_iou(masks, gt_masks)
        bbox_iou_cache = _bbox_iou(boxes.float(), gt_boxes.float())

        if num_crowd > 0:
            crowd_mask_iou_cache = _mask_iou(masks, crowd_masks, iscrowd=True)
            crowd_bbox_iou_cache = _bbox_iou(boxes.float(), crowd_boxes.float(), iscrowd=True)
        else:
            crowd_mask_iou_cache = None
            crowd_bbox_iou_cache = None

        # Detections must be matched in descending score order (greedy matching).
        box_indices = sorted(range(num_pred), key=lambda i: -box_scores[i])
        mask_indices = sorted(box_indices, key=lambda i: -mask_scores[i])

        # (name, iou lookup, crowd iou lookup, score lookup, score-sorted indices)
        iou_types = [
            ('box', lambda i,j: bbox_iou_cache[i, j].item(),
             lambda i,j: crowd_bbox_iou_cache[i,j].item(),
             lambda i: box_scores[i], box_indices),
            ('mask', lambda i,j: mask_iou_cache[i, j].item(),
             lambda i,j: crowd_mask_iou_cache[i,j].item(),
             lambda i: mask_scores[i], mask_indices)
        ]

    timer.start('Main loop')
    for _class in set(classes + gt_classes):
        ap_per_iou = []
        num_gt_for_class = sum([1 for x in gt_classes if x == _class])

        for iouIdx in range(len(iou_thresholds)):
            iou_threshold = iou_thresholds[iouIdx]

            for iou_type, iou_func, crowd_func, score_func, indices in iou_types:
                gt_used = [False] * len(gt_classes)

                ap_obj = ap_data[iou_type][iouIdx][_class]
                ap_obj.add_gt_positives(num_gt_for_class)

                for i in indices:
                    if classes[i] != _class:
                        continue

                    # Greedily match this detection to the unused gt with the highest IoU
                    # above the threshold.
                    max_iou_found = iou_threshold
                    max_match_idx = -1
                    for j in range(num_gt):
                        if gt_used[j] or gt_classes[j] != _class:
                            continue

                        iou = iou_func(i, j)

                        if iou > max_iou_found:
                            max_iou_found = iou
                            max_match_idx = j

                    if max_match_idx >= 0:
                        gt_used[max_match_idx] = True
                        ap_obj.push(score_func(i), True)
                    else:
                        # If the detection matches a crowd, we can just ignore it
                        matched_crowd = False

                        if num_crowd > 0:
                            for j in range(len(crowd_classes)):
                                if crowd_classes[j] != _class:
                                    continue

                                iou = crowd_func(i, j)

                                if iou > iou_threshold:
                                    matched_crowd = True
                                    break

                        # All this crowd code so that we can make sure that our eval code gives the
                        # same result as COCOEval. There aren't even that many crowd annotations to
                        # begin with, but accuracy is of the utmost importance.
                        if not matched_crowd:
                            ap_obj.push(score_func(i), False)
    timer.stop('Main loop')
class APDataObject:
    """
    Stores all the information necessary to calculate the AP for one IoU and one class.
    """

    def __init__(self):
        self.data_points = []        # list of (score, is_true_positive) tuples
        self.num_gt_positives = 0    # total ground-truth positives seen so far

    def push(self, score:float, is_true:bool):
        """Record one detection's score and whether it was a true positive."""
        self.data_points.append((score, is_true))

    def add_gt_positives(self, num_positives:int):
        """ Call this once per image. """
        self.num_gt_positives += num_positives

    def is_empty(self) -> bool:
        """True if nothing has been recorded at all."""
        return len(self.data_points) == 0 and self.num_gt_positives == 0

    def get_ap(self) -> float:
        """ Warning: result not cached. """
        if self.num_gt_positives == 0:
            return 0

        # Sort in place, highest-scoring detections first.
        self.data_points.sort(key=lambda pt: -pt[0])

        precisions = []
        recalls = []
        tp = 0
        fp = 0

        # Build the raw precision-recall curve, one point per detection.
        for _, is_true in self.data_points:
            if is_true:
                tp += 1
            else:
                fp += 1

            precisions.append(tp / (tp + fp))
            recalls.append(tp / self.num_gt_positives)

        # Make the precision curve monotonically non-increasing (remove temporary
        # dips), the same smoothing COCOEval applies.
        for i in range(len(precisions) - 1, 0, -1):
            precisions[i - 1] = max(precisions[i - 1], precisions[i])

        # Integrate precision(recall) with a fixed 101-point Riemann sum, COCO-style.
        y_range = [0] * 101  # idx 0 is recall == 0.0 and idx 100 is recall == 1.00
        x_range = np.array([x / 100 for x in range(101)])
        recalls = np.array(recalls)

        # For each sample point, take the precision of the nearest recorded recall
        # at or above it (searchsorted with side='left').
        indices = np.searchsorted(recalls, x_range, side='left')
        for bar_idx, precision_idx in enumerate(indices):
            if precision_idx < len(precisions):
                y_range[bar_idx] = precisions[precision_idx]

        # Average of the 101 sampled precisions.
        return sum(y_range) / len(y_range)
def badhash(x):
"""
Just a quick and dirty hash function for doing a deterministic shuffle based on image_id.
Source:
https://stackoverflow.com/questions/664014/what-integer-hash-function-are-good-that-accepts-an-integer-hash-key
"""
x = (((x >> 16) ^ x) * 0x045d9f3b) & 0xFFFFFFFF
x = (((x >> 16) ^ x) * 0x045d9f3b) & 0xFFFFFFFF
x = ((x >> 16) ^ x) & 0xFFFFFFFF
return x
def evalimage(net:Yolact, path:str, save_path:str=None):
    """
    Run the network on a single image. If save_path is None, show the result with
    matplotlib; otherwise write it to save_path with OpenCV.

    Fix: the original had the same `if save_path is None:` condition twice in a
    row; the BGR->RGB channel swap and the matplotlib display are now one branch.
    """
    frame = torch.from_numpy(cv2.imread(path)).cuda().float()
    batch = FastBaseTransform()(frame.unsqueeze(0))
    preds = net(batch)

    img_numpy = prep_display(preds, frame, None, None, undo_transform=False)

    if save_path is None:
        # matplotlib expects RGB, but the rendered frame is in OpenCV's BGR order.
        img_numpy = img_numpy[:, :, (2, 1, 0)]
        plt.imshow(img_numpy)
        plt.title(path)
        plt.show()
    else:
        # cv2.imwrite expects BGR, so no channel swap is needed here.
        cv2.imwrite(save_path, img_numpy)
def evalimages(net:Yolact, input_folder:str, output_folder:str):
    """Run evalimage on every file in input_folder, writing PNG results into output_folder."""
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    print()
    for entry in Path(input_folder).glob('*'):
        in_path = str(entry)
        base = os.path.basename(in_path)
        # Swap the extension for .png (keeps any dots earlier in the name).
        png_name = '.'.join(base.split('.')[:-1]) + '.png'
        out_path = os.path.join(output_folder, png_name)

        evalimage(net, in_path, out_path)
        print(in_path + ' -> ' + out_path)
    print('Done.')
from multiprocessing.pool import ThreadPool
from queue import Queue
class CustomDataParallel(torch.nn.DataParallel):
    """ A Custom Data Parallel class that properly gathers lists of dictionaries. """
    def gather(self, outputs, output_device):
        # Note that I don't actually want to convert everything to the output_device,
        # so just concatenate the per-device lists into one flat list.
        gathered = []
        for per_device in outputs:
            gathered += per_device
        return gathered
def evalvideo(net:Yolact, path:str, out_path:str=None):
    """
    Evaluate the network on a video file or webcam stream. Frames are pipelined
    through [transform_frame -> eval_network -> prep_frame] on a thread pool
    while a separate play_video thread displays (or writes, if out_path is set)
    finished frames from frame_buffer at a stabilized rate.
    """
    # If the path is a digit, parse it as a webcam index
    is_webcam = path.isdigit()

    # If the input image size is constant, this makes things faster (hence why we can use it in a video setting).
    cudnn.benchmark = True

    if is_webcam:
        vid = cv2.VideoCapture(int(path))
    else:
        vid = cv2.VideoCapture(path)

    if not vid.isOpened():
        print('Could not open video "%s"' % path)
        exit(-1)

    target_fps = round(vid.get(cv2.CAP_PROP_FPS))
    frame_width = round(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = round(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

    if is_webcam:
        num_frames = float('inf')
    else:
        num_frames = round(vid.get(cv2.CAP_PROP_FRAME_COUNT))

    net = CustomDataParallel(net).cuda()
    transform = torch.nn.DataParallel(FastBaseTransform()).cuda()
    frame_times = MovingAverage(100)
    fps = 0
    frame_time_target = 1 / target_fps
    running = True
    fps_str = ''
    vid_done = False
    frames_displayed = 0

    if out_path is not None:
        out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"), target_fps, (frame_width, frame_height))

    def cleanup_and_exit():
        """Release the video handles, kill the pool, and exit the process."""
        print()
        pool.terminate()
        vid.release()
        if out_path is not None:
            out.release()
        cv2.destroyAllWindows()
        exit()

    def get_next_frame(vid):
        """Read up to args.video_multiframe frames; a short list means the video ended."""
        frames = []
        for idx in range(args.video_multiframe):
            frame = vid.read()[1]
            if frame is None:
                return frames
            frames.append(frame)
        return frames

    def transform_frame(frames):
        """Move raw frames to the GPU and run the input transform on them as a batch."""
        with torch.no_grad():
            frames = [torch.from_numpy(frame).cuda().float() for frame in frames]
            return frames, transform(torch.stack(frames, 0))

    def eval_network(inp):
        """Run the net on a batch, padding with copies of frame 0 if the batch is short."""
        with torch.no_grad():
            frames, imgs = inp
            num_extra = 0
            while imgs.size(0) < args.video_multiframe:
                imgs = torch.cat([imgs, imgs[0].unsqueeze(0)], dim=0)
                num_extra += 1
            out = net(imgs)
            if num_extra > 0:
                out = out[:-num_extra]
            return frames, out

    def prep_frame(inp, fps_str):
        """Render one frame's detections into a displayable numpy image."""
        with torch.no_grad():
            frame, preds = inp
            return prep_display(preds, frame, None, None, undo_transform=False, class_color=True, fps_str=fps_str)

    frame_buffer = Queue()
    video_fps = 0

    # All this timing code to make sure that
    def play_video():
        """Consumer thread: pops finished frames and shows/writes them at a stabilized pace."""
        try:
            nonlocal frame_buffer, running, video_fps, is_webcam, num_frames, frames_displayed, vid_done

            video_frame_times = MovingAverage(100)
            frame_time_stabilizer = frame_time_target
            last_time = None
            stabilizer_step = 0.0005
            progress_bar = ProgressBar(30, num_frames)

            while running:
                frame_time_start = time.time()

                if not frame_buffer.empty():
                    next_time = time.time()
                    if last_time is not None:
                        video_frame_times.add(next_time - last_time)
                        video_fps = 1 / video_frame_times.get_avg()
                    if out_path is None:
                        cv2.imshow(path, frame_buffer.get())
                    else:
                        out.write(frame_buffer.get())
                    frames_displayed += 1
                    last_time = next_time

                    if out_path is not None:
                        # Print a progress bar when saving instead of displaying.
                        if video_frame_times.get_avg() == 0:
                            fps = 0
                        else:
                            fps = 1 / video_frame_times.get_avg()
                        progress = frames_displayed / num_frames * 100
                        progress_bar.set_val(frames_displayed)

                        print('\rProcessing Frames %s %6d / %6d (%5.2f%%) %5.2f fps '
                              % (repr(progress_bar), frames_displayed, num_frames, progress, fps), end='')

                # This is split because you don't want savevideo to require cv2 display functionality (see #197)
                if out_path is None and cv2.waitKey(1) == 27:
                    # Press Escape to close
                    running = False
                if not (frames_displayed < num_frames):
                    running = False

                if not vid_done:
                    # Nudge the playback period based on buffer fullness so the
                    # consumer neither starves nor falls behind the producer.
                    buffer_size = frame_buffer.qsize()
                    if buffer_size < args.video_multiframe:
                        frame_time_stabilizer += stabilizer_step
                    elif buffer_size > args.video_multiframe:
                        frame_time_stabilizer -= stabilizer_step
                        if frame_time_stabilizer < 0:
                            frame_time_stabilizer = 0

                    new_target = frame_time_stabilizer if is_webcam else max(frame_time_stabilizer, frame_time_target)
                else:
                    new_target = frame_time_target

                next_frame_target = max(2 * new_target - video_frame_times.get_avg(), 0)
                target_time = frame_time_start + next_frame_target - 0.001 # Let's just subtract a millisecond to be safe

                if out_path is None or args.emulate_playback:
                    # This gives more accurate timing than if sleeping the whole amount at once
                    while time.time() < target_time:
                        time.sleep(0.001)
                else:
                    # Let's not starve the main thread, now
                    time.sleep(0.001)
        except:
            # See issue #197 for why this is necessary
            import traceback
            traceback.print_exc()

    # Pull one (frame, dets) pair out of a batched pipeline result; keep the frame
    # on the same device as its detections when there are any.
    extract_frame = lambda x, i: (x[0][i] if x[1][i]['detection'] is None else x[0][i].to(x[1][i]['detection']['box'].device), [x[1][i]])

    # Prime the network on the first frame because I do some thread unsafe things otherwise
    print('Initializing model... ', end='')
    first_batch = eval_network(transform_frame(get_next_frame(vid)))
    print('Done.')

    # For each frame the sequence of functions it needs to go through to be processed (in reversed order)
    sequence = [prep_frame, eval_network, transform_frame]
    pool = ThreadPool(processes=len(sequence) + args.video_multiframe + 2)
    pool.apply_async(play_video)
    active_frames = [{'value': extract_frame(first_batch, i), 'idx': 0} for i in range(len(first_batch[0]))]

    print()
    if out_path is None: print('Press Escape to close.')
    try:
        while vid.isOpened() and running:
            # Hard limit on frames in buffer so we don't run out of memory >.>
            while frame_buffer.qsize() > 100:
                time.sleep(0.001)

            start_time = time.time()

            # Start loading the next frames from the disk
            if not vid_done:
                next_frames = pool.apply_async(get_next_frame, args=(vid,))
            else:
                next_frames = None

            if not (vid_done and len(active_frames) == 0):
                # For each frame in our active processing queue, dispatch a job
                # for that frame using the current function in the sequence
                for frame in active_frames:
                    _args = [frame['value']]
                    if frame['idx'] == 0:
                        _args.append(fps_str)
                    frame['value'] = pool.apply_async(sequence[frame['idx']], args=_args)

                # For each frame whose job was the last in the sequence (i.e. for all final outputs)
                for frame in active_frames:
                    if frame['idx'] == 0:
                        frame_buffer.put(frame['value'].get())

                # Remove the finished frames from the processing queue
                active_frames = [x for x in active_frames if x['idx'] > 0]

                # Finish evaluating every frame in the processing queue and advanced their position in the sequence
                for frame in list(reversed(active_frames)):
                    frame['value'] = frame['value'].get()
                    frame['idx'] -= 1

                    if frame['idx'] == 0:
                        # Split this up into individual threads for prep_frame since it doesn't support batch size
                        active_frames += [{'value': extract_frame(frame['value'], i), 'idx': 0} for i in range(1, len(frame['value'][0]))]
                        frame['value'] = extract_frame(frame['value'], 0)

                # Finish loading in the next frames and add them to the processing queue
                if next_frames is not None:
                    frames = next_frames.get()
                    if len(frames) == 0:
                        vid_done = True
                    else:
                        active_frames.append({'value': frames, 'idx': len(sequence)-1})

                # Compute FPS
                frame_times.add(time.time() - start_time)
                fps = args.video_multiframe / frame_times.get_avg()
            else:
                fps = 0

            fps_str = 'Processing FPS: %.2f | Video Playback FPS: %.2f | Frames in Buffer: %d' % (fps, video_fps, frame_buffer.qsize())
            if not args.display_fps:
                print('\r' + fps_str + ' ', end='')
    except KeyboardInterrupt:
        print('\nStopping...')

    cleanup_and_exit()
def evaluate(net:Yolact, dataset, train_mode=False):
    """
    Top-level evaluation driver. Dispatches to single-image / folder / video
    evaluation when the corresponding CLI flag is set; otherwise runs the full
    dataset loop and either displays, benchmarks, or accumulates mAP data.
    Returns the mAP table from calc_map in the metrics path, else None.
    """
    # Select the NMS variant for this run from the CLI flags.
    net.detect.use_fast_nms = args.fast_nms
    net.detect.use_cluster_nms = args.cluster_nms
    net.detect.use_cluster_diounms = args.cluster_diounms
    net.detect.use_spm_nms = args.spm
    net.detect.use_spm_dist_nms = args.spm_dist
    net.detect.use_spm_dist_weighted_nms = args.spm_dist_weighted
    net.detect.use_cross_class_nms = args.cross_class_nms
    cfg.mask_proto_debug = args.mask_proto_debug

    # TODO Currently we do not support Fast Mask Re-scroing in evalimage, evalimages, and evalvideo
    if args.image is not None:
        # "input:output" saves to a file; a bare path displays with matplotlib.
        if ':' in args.image:
            inp, out = args.image.split(':')
            evalimage(net, inp, out)
        else:
            evalimage(net, args.image)
        return
    elif args.images is not None:
        inp, out = args.images.split(':')
        evalimages(net, inp, out)
        return
    elif args.video is not None:
        if ':' in args.video:
            inp, out = args.video.split(':')
            evalvideo(net, inp, out)
        else:
            evalvideo(net, args.video)
        return

    frame_times = MovingAverage()
    dataset_size = len(dataset) if args.max_images < 0 else min(args.max_images, len(dataset))
    progress_bar = ProgressBar(30, dataset_size)

    print()

    if not args.display and not args.benchmark:
        # For each class and iou, stores tuples (score, isPositive)
        # Index ap_data[type][iouIdx][classIdx]
        ap_data = {
            'box' : [[APDataObject() for _ in cfg.dataset.class_names] for _ in iou_thresholds],
            'mask': [[APDataObject() for _ in cfg.dataset.class_names] for _ in iou_thresholds]
        }
        detections = Detections()
    else:
        timer.disable('Load Data')

    dataset_indices = list(range(len(dataset)))

    if args.shuffle:
        random.shuffle(dataset_indices)
    elif not args.no_sort:
        # Do a deterministic shuffle based on the image ids
        #
        # I do this because on python 3.5 dictionary key order is *random*, while in 3.6 it's
        # the order of insertion. That means on python 3.6, the images come in the order they are in
        # in the annotations file. For some reason, the first images in the annotations file are
        # the hardest. To combat this, I use a hard-coded hash function based on the image ids
        # to shuffle the indices we use. That way, no matter what python version or how pycocotools
        # handles the data, we get the same result every time.
        hashed = [badhash(x) for x in dataset.ids]
        dataset_indices.sort(key=lambda x: hashed[x])

    dataset_indices = dataset_indices[:dataset_size]

    try:
        # Main eval loop
        for it, image_idx in enumerate(dataset_indices):
            timer.reset()

            with timer.env('Load Data'):
                img, gt, gt_masks, h, w, num_crowd = dataset.pull_item(image_idx)

                # Test flag, do not upvote
                if cfg.mask_proto_debug:
                    with open('scripts/info.txt', 'w') as f:
                        f.write(str(dataset.ids[image_idx]))
                    np.save('scripts/gt.npy', gt_masks)

                batch = Variable(img.unsqueeze(0))
                if args.cuda:
                    batch = batch.cuda()

            with timer.env('Network Extra'):
                preds = net(batch)
            # Perform the meat of the operation here depending on our mode.
            if args.display:
                img_numpy = prep_display(preds, img, h, w)
            elif args.benchmark:
                prep_benchmark(preds, h, w)
            else:
                prep_metrics(ap_data, preds, img, gt, gt_masks, h, w, num_crowd, dataset.ids[image_idx], detections)

            # First couple of images take longer because we're constructing the graph.
            # Since that's technically initialization, don't include those in the FPS calculations.
            if it > 1:
                frame_times.add(timer.total_time())

            if args.display:
                if it > 1:
                    print('Avg FPS: %.4f' % (1 / frame_times.get_avg()))
                plt.imshow(img_numpy)
                plt.title(str(dataset.ids[image_idx]))
                plt.show()
            elif not args.no_bar:
                if it > 1: fps = 1 / frame_times.get_avg()
                else: fps = 0
                progress = (it+1) / dataset_size * 100
                progress_bar.set_val(it+1)
                print('\rProcessing Images %s %6d / %6d (%5.2f%%) %5.2f fps '
                      % (repr(progress_bar), it+1, dataset_size, progress, fps), end='')

        if not args.display and not args.benchmark:
            print()
            if args.output_coco_json:
                print('Dumping detections...')
                if args.output_web_json:
                    detections.dump_web()
                else:
                    detections.dump()
            else:
                if not train_mode:
                    print('Saving data...')
                    with open(args.ap_data_file, 'wb') as f:
                        pickle.dump(ap_data, f)

                return calc_map(ap_data)
        elif args.benchmark:
            print()
            print()
            print('Stats for the last frame:')
            timer.print_stats()
            avg_seconds = frame_times.get_avg()
            print('Average: %5.2f fps, %5.2f ms' % (1 / frame_times.get_avg(), 1000*avg_seconds))

    except KeyboardInterrupt:
        print('Stopping...')
def calc_map(ap_data):
    """Aggregate the per-class / per-threshold APDataObjects into box and mask mAP tables."""
    print('Calculating mAP...')
    aps = [{'box': [], 'mask': []} for _ in iou_thresholds]

    # Collect one AP per (threshold, type) for every class that has any data.
    for _class in range(len(cfg.dataset.class_names)):
        for iou_idx in range(len(iou_thresholds)):
            for iou_type in ('box', 'mask'):
                ap_obj = ap_data[iou_type][iou_idx][_class]

                if not ap_obj.is_empty():
                    aps[iou_idx][iou_type].append(ap_obj.get_ap())

    all_maps = {'box': OrderedDict(), 'mask': OrderedDict()}

    # Looking back at it, this code is really hard to read :/
    for iou_type in ('box', 'mask'):
        all_maps[iou_type]['all'] = 0 # Make this first in the ordereddict
        for i, threshold in enumerate(iou_thresholds):
            class_aps = aps[i][iou_type]
            mAP = (sum(class_aps) / len(class_aps) * 100) if len(class_aps) > 0 else 0
            all_maps[iou_type][int(threshold * 100)] = mAP
        # 'all' is the mean over the per-threshold entries (excluding itself).
        all_maps[iou_type]['all'] = sum(all_maps[iou_type].values()) / (len(all_maps[iou_type].values()) - 1)

    print_maps(all_maps)

    # Put in a prettier format so we can serialize it to json during training
    all_maps = {k: {j: round(u, 2) for j, u in v.items()} for k, v in all_maps.items()}
    return all_maps
def print_maps(all_maps):
    """Pretty-print the box/mask mAP table produced by calc_map. Warning: hacky."""
    def make_row(vals):
        return (' %5s |' * len(vals)) % tuple(vals)

    def make_sep(n):
        return '-------+' * n

    # Integer keys are IoU thresholds (e.g. 50 -> '.50'); string keys pass through.
    header = [('.%d ' % x if isinstance(x, int) else x + ' ') for x in all_maps['box'].keys()]
    sep = make_sep(len(all_maps['box']) + 1)

    print()
    print(make_row([''] + header))
    print(sep)
    for iou_type in ('box', 'mask'):
        row = ['%.2f' % x if x < 100 else '%.1f' % x for x in all_maps[iou_type].values()]
        print(make_row([iou_type] + row))
    print(sep)
    print()
if __name__ == '__main__':
    parse_args()

    if args.config is not None:
        set_cfg(args.config)

    if args.trained_model == 'interrupt':
        args.trained_model = SavePath.get_interrupt('weights/')
    elif args.trained_model == 'latest':
        args.trained_model = SavePath.get_latest('weights/', cfg.name)

    if args.config is None:
        model_path = SavePath.from_str(args.trained_model)
        # TODO: Bad practice? Probably want to do a name lookup instead.
        args.config = model_path.model_name + '_config'
        print('Config not specified. Parsed %s from the file name.\n' % args.config)
        set_cfg(args.config)

    # Fix: the original used `if args.x = True:` (a SyntaxError) for all of these
    # checks, and `assert Exception(...)`, which asserts on a truthy Exception
    # instance and therefore never fires. Count the mutually-exclusive NMS flags
    # properly and raise if more than one was requested.
    if args.cross_class_nms:
        nms = 'cross class'
    else:
        nms = 'not use cross class'

    nms_flags = [args.fast_nms, args.cluster_nms, args.cluster_diounms,
                 args.spm, args.spm_dist, args.spm_dist_weighted]
    num_count = sum(1 for flag in nms_flags if flag)

    if num_count > 1:
        raise Exception("You must choose one NMS strategy. Options: fast_nms, cluster_nms, cluster_diounms, spm, spm_dist, spm_dist_weighted.")

    if args.detect:
        cfg.eval_mask_branch = False

    if args.dataset is not None:
        set_dataset(args.dataset)

    with torch.no_grad():
        if not os.path.exists('results'):
            os.makedirs('results')

        if args.cuda:
            cudnn.fastest = True
            torch.set_default_tensor_type('torch.cuda.FloatTensor')
        else:
            torch.set_default_tensor_type('torch.FloatTensor')

        # --resume skips evaluation and recomputes mAP from the pickled ap_data.
        if args.resume and not args.display:
            with open(args.ap_data_file, 'rb') as f:
                ap_data = pickle.load(f)
            calc_map(ap_data)
            exit()

        if args.image is None and args.video is None and args.images is None:
            dataset = COCODetection(cfg.dataset.valid_images, cfg.dataset.valid_info,
                                    transform=BaseTransform(), has_gt=cfg.dataset.has_gt)
            prep_coco_cats()
        else:
            dataset = None

        print('Loading model...', end='')
        net = Yolact()
        net.load_weights(args.trained_model)
        net.eval()
        print(' Done.')

        if args.cuda:
            net = net.cuda()

        evaluate(net, dataset)
================================================
FILE: external/DCNv2/LICENSE
================================================
BSD 3-Clause License
Copyright (c) 2019, Charles Shang
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: external/DCNv2/README.md
================================================
## Deformable Convolutional Networks V2 with Pytorch 1.0
### Build
```bash
./make.sh # build
python test.py # run examples and gradient check
```
### An Example
- deformable conv
```python
from dcn_v2 import DCN
input = torch.randn(2, 64, 128, 128).cuda()
# wrap all things (offset and mask) in DCN
dcn = DCN(64, 64, kernel_size=(3,3), stride=1, padding=1, deformable_groups=2).cuda()
output = dcn(input)
print(output.shape)
```
- deformable roi pooling
```python
from dcn_v2 import DCNPooling
input = torch.randn(2, 32, 64, 64).cuda()
batch_inds = torch.randint(2, (20, 1)).cuda().float()
x = torch.randint(256, (20, 1)).cuda().float()
y = torch.randint(256, (20, 1)).cuda().float()
w = torch.randint(64, (20, 1)).cuda().float()
h = torch.randint(64, (20, 1)).cuda().float()
rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
# modulated deformable pooling (V2)
# wrap all things (offset and mask) in DCNPooling
dpooling = DCNPooling(spatial_scale=1.0 / 4,
pooled_size=7,
output_dim=32,
no_trans=False,
group_size=1,
trans_std=0.1).cuda()
dout = dpooling(input, rois)
```
### Note
Now the master branch is for pytorch 1.0 (new ATen API), you can switch back to pytorch 0.4 with,
```bash
git checkout pytorch_0.4
```
### Known Issues:
- [x] Gradient check w.r.t offset (solved)
- [ ] Backward is not reentrant (minor)
This is an adaption of the official [Deformable-ConvNets](https://github.com/msracver/Deformable-ConvNets/tree/master/DCNv2_op).
I have run the gradient check many times with DOUBLE type. Every tensor **except offset** passes.
However, when I set the offset to 0.5, it passes. I'm still wondering what causes this problem. Could it be due to some
non-differentiable points?
Update: all gradient check passes with double precision.
Another issue is that it raises `RuntimeError: Backward is not reentrant`. However, the error is very small (`<1e-7` for
float `<1e-15` for double),
so it may not be a serious problem (?)
Please post an issue or PR if you have any comments.
================================================
FILE: external/DCNv2/dcn_v2.py
================================================
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import math
import torch
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair
from torch.autograd.function import once_differentiable
import _ext as _backend
class _DCNv2(Function):
    """Autograd wrapper around the compiled modulated deformable
    convolution (DCNv2) kernels exposed by the ``_ext`` backend.

    forward() takes the input feature map plus externally supplied
    ``offset`` and ``mask`` tensors; backward() returns gradients for the
    five tensor arguments and ``None`` for the four hyper-parameters.
    """

    @staticmethod
    def forward(ctx, input, offset, mask, weight, bias,
                stride, padding, dilation, deformable_groups):
        # Normalize scalar hyper-parameters to (h, w) pairs and stash them
        # on the context for the backward pass.
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.kernel_size = _pair(weight.shape[2:4])
        ctx.deformable_groups = deformable_groups
        output = _backend.dcn_v2_forward(input, weight, bias,
                                         offset, mask,
                                         ctx.kernel_size[0], ctx.kernel_size[1],
                                         ctx.stride[0], ctx.stride[1],
                                         ctx.padding[0], ctx.padding[1],
                                         ctx.dilation[0], ctx.dilation[1],
                                         ctx.deformable_groups)
        ctx.save_for_backward(input, offset, mask, weight, bias)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input, offset, mask, weight, bias = ctx.saved_tensors
        grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \
            _backend.dcn_v2_backward(input, weight,
                                     bias,
                                     offset, mask,
                                     grad_output,
                                     ctx.kernel_size[0], ctx.kernel_size[1],
                                     ctx.stride[0], ctx.stride[1],
                                     ctx.padding[0], ctx.padding[1],
                                     ctx.dilation[0], ctx.dilation[1],
                                     ctx.deformable_groups)
        # One gradient slot per forward() argument; the four hyper-parameters
        # (stride, padding, dilation, deformable_groups) get None.
        return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
                None, None, None, None)


dcn_v2_conv = _DCNv2.apply
class DCNv2(nn.Module):
    """Modulated deformable convolution layer whose forward() takes
    precomputed ``offset`` and ``mask`` tensors (see :class:`DCN` for the
    variant that predicts them itself)."""

    def __init__(self, in_channels, out_channels,
                 kernel_size, stride, padding, dilation=1, deformable_groups=1):
        super(DCNv2, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)
        self.padding = _pair(padding)
        self.dilation = _pair(dilation)
        self.deformable_groups = deformable_groups

        self.weight = nn.Parameter(torch.Tensor(
            out_channels, in_channels, *self.kernel_size))
        self.bias = nn.Parameter(torch.Tensor(out_channels))
        self.reset_parameters()

    def reset_parameters(self):
        # Uniform init with bound 1/sqrt(fan_in); bias starts at zero.
        n = self.in_channels
        for k in self.kernel_size:
            n *= k
        stdv = 1. / math.sqrt(n)
        self.weight.data.uniform_(-stdv, stdv)
        self.bias.data.zero_()

    def forward(self, input, offset, mask):
        # offset carries two values (dy, dx) per sampling location and mask
        # one modulation scalar, with K = deformable_groups * kh * kw
        # locations, so their channel counts must be 2*K and K respectively.
        assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \
            offset.shape[1]
        assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \
            mask.shape[1]
        return dcn_v2_conv(input, offset, mask,
                           self.weight,
                           self.bias,
                           self.stride,
                           self.padding,
                           self.dilation,
                           self.deformable_groups)
class DCN(DCNv2):
    """DCNv2 layer that predicts its own offsets and masks from the input
    via an internal regular convolution, so forward() needs only the
    feature map."""

    def __init__(self, in_channels, out_channels,
                 kernel_size, stride, padding,
                 dilation=1, deformable_groups=1):
        super(DCN, self).__init__(in_channels, out_channels,
                                  kernel_size, stride, padding, dilation, deformable_groups)
        # 3 predicted values per kernel position: 2 offsets (dy, dx) + 1 mask logit.
        channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1]
        self.conv_offset_mask = nn.Conv2d(self.in_channels,
                                          channels_,
                                          kernel_size=self.kernel_size,
                                          stride=self.stride,
                                          padding=self.padding,
                                          bias=True)
        self.init_offset()

    def init_offset(self):
        # Zero-init so training starts from zero offsets and a mask of
        # sigmoid(0) = 0.5 everywhere (i.e. an undeformed sampling grid).
        self.conv_offset_mask.weight.data.zero_()
        self.conv_offset_mask.bias.data.zero_()

    def forward(self, input):
        out = self.conv_offset_mask(input)
        # Channel layout is [dy block | dx block | mask block].
        o1, o2, mask = torch.chunk(out, 3, dim=1)
        offset = torch.cat((o1, o2), dim=1)
        mask = torch.sigmoid(mask)
        return dcn_v2_conv(input, offset, mask,
                           self.weight, self.bias,
                           self.stride,
                           self.padding,
                           self.dilation,
                           self.deformable_groups)
class _DCNv2Pooling(Function):
    """Autograd wrapper around the compiled deformable PS-RoI pooling
    kernels exposed by the ``_ext`` backend."""

    @staticmethod
    def forward(ctx, input, rois, offset,
                spatial_scale,
                pooled_size,
                output_dim,
                no_trans,
                group_size=1,
                part_size=None,
                sample_per_part=4,
                trans_std=.0):
        ctx.spatial_scale = spatial_scale
        ctx.no_trans = int(no_trans)
        ctx.output_dim = output_dim
        ctx.group_size = group_size
        ctx.pooled_size = pooled_size
        # part_size defaults to the pooled output size.
        ctx.part_size = pooled_size if part_size is None else part_size
        ctx.sample_per_part = sample_per_part
        ctx.trans_std = trans_std

        output, output_count = \
            _backend.dcn_v2_psroi_pooling_forward(input, rois, offset,
                                                  ctx.no_trans, ctx.spatial_scale,
                                                  ctx.output_dim, ctx.group_size,
                                                  ctx.pooled_size, ctx.part_size,
                                                  ctx.sample_per_part, ctx.trans_std)
        # output_count is needed by the backward kernel for normalization.
        ctx.save_for_backward(input, rois, offset, output_count)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input, rois, offset, output_count = ctx.saved_tensors
        grad_input, grad_offset = \
            _backend.dcn_v2_psroi_pooling_backward(grad_output,
                                                   input,
                                                   rois,
                                                   offset,
                                                   output_count,
                                                   ctx.no_trans,
                                                   ctx.spatial_scale,
                                                   ctx.output_dim,
                                                   ctx.group_size,
                                                   ctx.pooled_size,
                                                   ctx.part_size,
                                                   ctx.sample_per_part,
                                                   ctx.trans_std)
        # rois receive no gradient; the eight config scalars get None too.
        return (grad_input, None, grad_offset,
                None, None, None, None, None, None, None, None)


dcn_v2_pooling = _DCNv2Pooling.apply
class DCNv2Pooling(nn.Module):
    """Deformable PS-RoI pooling module whose forward() takes externally
    supplied per-bin offsets (ignored when ``no_trans`` is true)."""

    def __init__(self,
                 spatial_scale,
                 pooled_size,
                 output_dim,
                 no_trans,
                 group_size=1,
                 part_size=None,
                 sample_per_part=4,
                 trans_std=.0):
        super(DCNv2Pooling, self).__init__()
        self.spatial_scale = spatial_scale
        self.pooled_size = pooled_size
        self.output_dim = output_dim
        self.no_trans = no_trans
        self.group_size = group_size
        # part_size defaults to the pooled output size.
        self.part_size = pooled_size if part_size is None else part_size
        self.sample_per_part = sample_per_part
        self.trans_std = trans_std

    def forward(self, input, rois, offset):
        assert input.shape[1] == self.output_dim
        if self.no_trans:
            # Offsets are unused in this mode; pass an empty tensor through.
            offset = input.new()
        return dcn_v2_pooling(input, rois, offset,
                              self.spatial_scale,
                              self.pooled_size,
                              self.output_dim,
                              self.no_trans,
                              self.group_size,
                              self.part_size,
                              self.sample_per_part,
                              self.trans_std)
class DCNPooling(DCNv2Pooling):
    """Deformable RoI pooling that learns its own offsets and masks.

    When ``no_trans`` is false, a plain RoI-align pass feeds a small FC head
    that regresses per-bin offsets and a modulation mask; those drive a
    second, deformable pooling pass whose output is scaled by the mask.
    """

    def __init__(self,
                 spatial_scale,
                 pooled_size,
                 output_dim,
                 no_trans,
                 group_size=1,
                 part_size=None,
                 sample_per_part=4,
                 trans_std=.0,
                 deform_fc_dim=1024):
        super(DCNPooling, self).__init__(spatial_scale,
                                         pooled_size,
                                         output_dim,
                                         no_trans,
                                         group_size,
                                         part_size,
                                         sample_per_part,
                                         trans_std)
        self.deform_fc_dim = deform_fc_dim

        if not no_trans:
            # FC head mapping pooled features to 3 values per output bin:
            # dy, dx and a mask logit.
            self.offset_mask_fc = nn.Sequential(
                nn.Linear(self.pooled_size * self.pooled_size *
                          self.output_dim, self.deform_fc_dim),
                nn.ReLU(inplace=True),
                nn.Linear(self.deform_fc_dim, self.deform_fc_dim),
                nn.ReLU(inplace=True),
                nn.Linear(self.deform_fc_dim, self.pooled_size *
                          self.pooled_size * 3)
            )
            # Zero-init the last layer: zero offsets, mask = sigmoid(0) = 0.5.
            self.offset_mask_fc[4].weight.data.zero_()
            self.offset_mask_fc[4].bias.data.zero_()

    def forward(self, input, rois):
        offset = input.new()

        if not self.no_trans:
            # First pass: regular RoI align (no_trans forced to True) to get
            # per-RoI features for the offset/mask head.
            n = rois.shape[0]
            roi = dcn_v2_pooling(input, rois, offset,
                                 self.spatial_scale,
                                 self.pooled_size,
                                 self.output_dim,
                                 True,  # no trans
                                 self.group_size,
                                 self.part_size,
                                 self.sample_per_part,
                                 self.trans_std)

            # Predict per-bin offsets and the modulation mask.
            offset_mask = self.offset_mask_fc(roi.view(n, -1))
            offset_mask = offset_mask.view(
                n, 3, self.pooled_size, self.pooled_size)
            o1, o2, mask = torch.chunk(offset_mask, 3, dim=1)
            offset = torch.cat((o1, o2), dim=1)
            mask = torch.sigmoid(mask)

            # Second pass: deformable pooling, modulated by the mask.
            return dcn_v2_pooling(input, rois, offset,
                                  self.spatial_scale,
                                  self.pooled_size,
                                  self.output_dim,
                                  self.no_trans,
                                  self.group_size,
                                  self.part_size,
                                  self.sample_per_part,
                                  self.trans_std) * mask

        # no_trans mode: plain RoI align only.
        return dcn_v2_pooling(input, rois, offset,
                              self.spatial_scale,
                              self.pooled_size,
                              self.output_dim,
                              self.no_trans,
                              self.group_size,
                              self.part_size,
                              self.sample_per_part,
                              self.trans_std)
================================================
FILE: external/DCNv2/setup.py
================================================
#!/usr/bin/env python
import os
import glob
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
requirements = ["torch", "torchvision"]
def get_extensions():
    """Build the list of C++/CUDA extension modules for setup().

    Collects the top-level and CPU C++ sources under ``src/`` and, when a
    CUDA toolchain is available, the ``.cu`` kernels as well.

    Returns:
        list: a single CUDAExtension named ``_ext``.

    Raises:
        NotImplementedError: if CUDA is unavailable — this package ships no
            CPU-only build (the CPU sources are stubs).
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "src")

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))

    sources = main_file + source_cpu
    extension = CppExtension
    extra_compile_args = {"cxx": []}
    define_macros = []

    if torch.cuda.is_available() and CUDA_HOME is not None:
        extension = CUDAExtension
        sources += source_cuda
        define_macros += [("WITH_CUDA", None)]
        # Disable half-precision operator overloads that break older nvcc.
        extra_compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
    else:
        raise NotImplementedError('Cuda is not available')

    # glob already returned absolute paths, so this join is a no-op kept for
    # parity with the upstream project layout.
    sources = [os.path.join(extensions_dir, s) for s in sources]
    include_dirs = [extensions_dir]

    ext_modules = [
        extension(
            "_ext",
            sources,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]
    return ext_modules
# Register the compiled "_ext" extension; built with BuildExtension so the
# mixed C++/CUDA sources are compiled with the proper toolchains.
setup(
    name="DCNv2",
    version="0.1",
    author="charlesshang",
    url="https://github.com/charlesshang/DCNv2",
    description="deformable convolutional networks",
    packages=find_packages(exclude=("configs", "tests",)),
    # install_requires=requirements,
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
================================================
FILE: external/DCNv2/src/cpu/dcn_v2_cpu.cpp
================================================
#include
#include
#include
// CPU implementation is intentionally absent: DCNv2 only ships CUDA kernels,
// so this stub raises unconditionally. Signature mirrors dcn_v2_cuda_forward.
at::Tensor
dcn_v2_cpu_forward(const at::Tensor &input,
                   const at::Tensor &weight,
                   const at::Tensor &bias,
                   const at::Tensor &offset,
                   const at::Tensor &mask,
                   const int kernel_h,
                   const int kernel_w,
                   const int stride_h,
                   const int stride_w,
                   const int pad_h,
                   const int pad_w,
                   const int dilation_h,
                   const int dilation_w,
                   const int deformable_group)
{
    AT_ERROR("Not implement on cpu");
}
std::vector
dcn_v2_cpu_backward(const at::Tensor &input,
const at::Tensor &weight,
const at::Tensor &bias,
const at::Tensor &offset,
const at::Tensor &mask,
const at::Tensor &grad_output,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int pad_h, int pad_w,
int dilation_h, int dilation_w,
int deformable_group)
{
AT_ERROR("Not implement on cpu");
}
std::tuple
dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
const at::Tensor &bbox,
const at::Tensor &trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
AT_ERROR("Not implement on cpu");
}
std::tuple
dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
const at::Tensor &input,
const at::Tensor &bbox,
const at::Tensor &trans,
const at::Tensor &top_count,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
AT_ERROR("Not implement on cpu");
}
================================================
FILE: external/DCNv2/src/cpu/vision.h
================================================
#pragma once
#include
at::Tensor
dcn_v2_cpu_forward(const at::Tensor &input,
const at::Tensor &weight,
const at::Tensor &bias,
const at::Tensor &offset,
const at::Tensor &mask,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const int deformable_group);
std::vector
dcn_v2_cpu_backward(const at::Tensor &input,
const at::Tensor &weight,
const at::Tensor &bias,
const at::Tensor &offset,
const at::Tensor &mask,
const at::Tensor &grad_output,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int pad_h, int pad_w,
int dilation_h, int dilation_w,
int deformable_group);
std::tuple
dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
const at::Tensor &bbox,
const at::Tensor &trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std);
std::tuple
dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
const at::Tensor &input,
const at::Tensor &bbox,
const at::Tensor &trans,
const at::Tensor &top_count,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std);
================================================
FILE: external/DCNv2/src/cuda/dcn_v2_cuda.cu
================================================
// NOTE(review): the bracketed include targets were lost during extraction;
// restored to match the upstream DCNv2 sources — verify before merging.
#include <vector>
#include "cuda/dcn_v2_im2col_cuda.h"

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>

// Global THC state used by the legacy THCudaBlas_* / THCudaMalloc calls below.
THCState *state = at::globalContext().lazyInitCUDA();

// author: Charles Shang
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
// [batch gemm]
// https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu
// Fills the per-sample pointer arrays consumed by the batched GEMM calls in
// dcn_v2_cuda_forward: one thread per batch element stores the base address of
// that sample's slice inside the contiguous input/output/columns/ones buffers.
// weight and bias are shared across the batch, so every entry aliases them.
__global__ void createBatchGemmBuffer(const float **input_b, float **output_b,
                                      float **columns_b, const float **ones_b,
                                      const float **weight_b, const float **bias_b,
                                      float *input, float *output,
                                      float *columns, float *ones,
                                      float *weight, float *bias,
                                      const int input_stride, const int output_stride,
                                      const int columns_stride, const int ones_stride,
                                      const int num_batches)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_batches)
    {
        input_b[idx] = input + idx * input_stride;
        output_b[idx] = output + idx * output_stride;
        columns_b[idx] = columns + idx * columns_stride;
        ones_b[idx] = ones + idx * ones_stride;
        // share weights and bias within a Mini-Batch
        weight_b[idx] = weight;
        bias_b[idx] = bias;
    }
}
// Forward pass of modulated deformable convolution (DCNv2):
//   1. output  = bias, broadcast via a batched (ones x bias) GEMM
//   2. columns = modulated deformable im2col of input/offset/mask
//   3. output += weight * columns via a second batched GEMM
// NOTE(review): template arguments (static_cast<...>, .data<scalar_t>()) and
// the <<<...>>> launch configuration were lost during extraction and have been
// restored from the upstream DCNv2 sources — verify before merging.
at::Tensor
dcn_v2_cuda_forward(const at::Tensor &input,
                    const at::Tensor &weight,
                    const at::Tensor &bias,
                    const at::Tensor &offset,
                    const at::Tensor &mask,
                    const int kernel_h,
                    const int kernel_w,
                    const int stride_h,
                    const int stride_w,
                    const int pad_h,
                    const int pad_w,
                    const int dilation_h,
                    const int dilation_w,
                    const int deformable_group)
{
    using scalar_t = float;
    // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask));
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
    AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");

    const int batch = input.size(0);
    const int channels = input.size(1);
    const int height = input.size(2);
    const int width = input.size(3);

    const int channels_out = weight.size(0);
    const int channels_kernel = weight.size(1);
    const int kernel_h_ = weight.size(2);
    const int kernel_w_ = weight.size(3);

    // Fixed message arguments: previously kernel_h_ was passed twice and the
    // requested kernel size was never shown.
    AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
               "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h, kernel_w, kernel_h_, kernel_w_);
    AT_ASSERTM(channels == channels_kernel,
               "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel);

    const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

    auto ones = at::ones({batch, height_out, width_out}, input.options());
    auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
    auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());

    // prepare for batch-wise computing, which is significantly faster than
    // instance-wise computing when batch size is large.
    int matrices_size = batch * sizeof(float *);
    auto input_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
    auto output_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
    auto columns_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
    auto ones_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
    auto weight_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
    auto bias_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));

    const int block = 128;
    const int grid = (batch + block - 1) / block;

    createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
        input_b, output_b,
        columns_b, ones_b,
        weight_b, bias_b,
        input.data<scalar_t>(),
        output.data<scalar_t>(),
        columns.data<scalar_t>(),
        ones.data<scalar_t>(),
        weight.data<scalar_t>(),
        bias.data<scalar_t>(),
        channels * width * height,
        channels_out * width_out * height_out,
        channels * kernel_h * kernel_w * height_out * width_out,
        height_out * width_out,
        batch);

    // Step 1: output = ones * bias^T (broadcast the bias over every pixel).
    long m_ = channels_out;
    long n_ = height_out * width_out;
    long k_ = 1;
    THCudaBlas_SgemmBatched(state,
                            't',
                            'n',
                            n_,
                            m_,
                            k_,
                            1.0f,
                            ones_b, k_,
                            bias_b, k_,
                            0.0f,
                            output_b, n_,
                            batch);

    // Step 2: unfold input into columns, applying offsets and masks.
    modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
                                     input.data<scalar_t>(),
                                     offset.data<scalar_t>(),
                                     mask.data<scalar_t>(),
                                     batch, channels, height, width,
                                     height_out, width_out, kernel_h, kernel_w,
                                     pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                                     deformable_group,
                                     columns.data<scalar_t>());

    // Step 3: output += weight * columns.
    long m = channels_out;
    long n = height_out * width_out;
    long k = channels * kernel_h * kernel_w;
    THCudaBlas_SgemmBatched(state,
                            'n',
                            'n',
                            n,
                            m,
                            k,
                            1.0f,
                            (const float **)columns_b, n,
                            weight_b, k,
                            1.0f,
                            output_b, n,
                            batch);

    THCudaFree(state, input_b);
    THCudaFree(state, output_b);
    THCudaFree(state, columns_b);
    THCudaFree(state, ones_b);
    THCudaFree(state, weight_b);
    THCudaFree(state, bias_b);
    return output;
}
// Backward counterpart of createBatchGemmBuffer: one thread per batch element
// stores the base address of that sample's slice of grad_output/columns/ones;
// weight, grad_weight and grad_bias are shared, so every entry aliases them.
__global__ void createBatchGemmBufferBackward(
    float **grad_output_b,
    float **columns_b,
    float **ones_b,
    float **weight_b,
    float **grad_weight_b,
    float **grad_bias_b,
    float *grad_output,
    float *columns,
    float *ones,
    float *weight,
    float *grad_weight,
    float *grad_bias,
    const int grad_output_stride,
    const int columns_stride,
    const int ones_stride,
    const int num_batches)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_batches)
    {
        grad_output_b[idx] = grad_output + idx * grad_output_stride;
        columns_b[idx] = columns + idx * columns_stride;
        ones_b[idx] = ones + idx * ones_stride;
        // share weights and bias within a Mini-Batch
        weight_b[idx] = weight;
        grad_weight_b[idx] = grad_weight;
        grad_bias_b[idx] = grad_bias;
    }
}
// Backward pass of modulated deformable convolution. Processes the batch one
// sample at a time: recomputes columns = weight^T * grad_output, then scatters
// gradients to offset/mask, to the input image, and accumulates grad_weight /
// grad_bias with GEMM/GEMV. Returns {grad_input, grad_offset, grad_mask,
// grad_weight, grad_bias}, matching the unpack order in dcn_v2.py.
// NOTE(review): the <at::Tensor> return template and .data<scalar_t>() calls
// were lost during extraction and restored from upstream — verify.
std::vector<at::Tensor> dcn_v2_cuda_backward(const at::Tensor &input,
                                             const at::Tensor &weight,
                                             const at::Tensor &bias,
                                             const at::Tensor &offset,
                                             const at::Tensor &mask,
                                             const at::Tensor &grad_output,
                                             int kernel_h, int kernel_w,
                                             int stride_h, int stride_w,
                                             int pad_h, int pad_w,
                                             int dilation_h, int dilation_w,
                                             int deformable_group)
{
    THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous");
    THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous");

    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
    AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");

    const int batch = input.size(0);
    const int channels = input.size(1);
    const int height = input.size(2);
    const int width = input.size(3);

    const int channels_out = weight.size(0);
    const int channels_kernel = weight.size(1);
    const int kernel_h_ = weight.size(2);
    const int kernel_w_ = weight.size(3);

    // Fixed message arguments: previously kernel_h_ was passed twice and the
    // requested kernel size was never shown.
    AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
               "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h, kernel_w, kernel_h_, kernel_w_);
    AT_ASSERTM(channels == channels_kernel,
               "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel);

    const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

    auto ones = at::ones({height_out, width_out}, input.options());
    auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
    // (removed an unused `output` allocation that the forward-pass copy left here)

    auto grad_input = at::zeros_like(input);
    auto grad_weight = at::zeros_like(weight);
    auto grad_bias = at::zeros_like(bias);
    auto grad_offset = at::zeros_like(offset);
    auto grad_mask = at::zeros_like(mask);

    using scalar_t = float;

    for (int b = 0; b < batch; b++)
    {
        auto input_n = input.select(0, b);
        auto offset_n = offset.select(0, b);
        auto mask_n = mask.select(0, b);
        auto grad_output_n = grad_output.select(0, b);
        auto grad_input_n = grad_input.select(0, b);
        auto grad_offset_n = grad_offset.select(0, b);
        auto grad_mask_n = grad_mask.select(0, b);

        // columns = weight^T * grad_output (gradient w.r.t. the column buffer)
        long m = channels * kernel_h * kernel_w;
        long n = height_out * width_out;
        long k = channels_out;

        THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f,
                         grad_output_n.data<scalar_t>(), n,
                         weight.data<scalar_t>(), m, 0.0f,
                         columns.data<scalar_t>(), n);

        // gradient w.r.t. input coordinate data (offset and mask)
        modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state),
                                               columns.data<scalar_t>(),
                                               input_n.data<scalar_t>(),
                                               offset_n.data<scalar_t>(),
                                               mask_n.data<scalar_t>(),
                                               1, channels, height, width,
                                               height_out, width_out, kernel_h, kernel_w,
                                               pad_h, pad_w, stride_h, stride_w,
                                               dilation_h, dilation_w, deformable_group,
                                               grad_offset_n.data<scalar_t>(),
                                               grad_mask_n.data<scalar_t>());
        // gradient w.r.t. input data
        modulated_deformable_col2im_cuda(THCState_getCurrentStream(state),
                                         columns.data<scalar_t>(),
                                         offset_n.data<scalar_t>(),
                                         mask_n.data<scalar_t>(),
                                         1, channels, height, width,
                                         height_out, width_out, kernel_h, kernel_w,
                                         pad_h, pad_w, stride_h, stride_w,
                                         dilation_h, dilation_w, deformable_group,
                                         grad_input_n.data<scalar_t>());

        // gradient w.r.t. weight, dWeight should accumulate across the batch and group
        modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
                                         input_n.data<scalar_t>(),
                                         offset_n.data<scalar_t>(),
                                         mask_n.data<scalar_t>(),
                                         1, channels, height, width,
                                         height_out, width_out, kernel_h, kernel_w,
                                         pad_h, pad_w, stride_h, stride_w,
                                         dilation_h, dilation_w, deformable_group,
                                         columns.data<scalar_t>());

        long m_ = channels_out;
        long n_ = channels * kernel_h * kernel_w;
        long k_ = height_out * width_out;

        THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f,
                         columns.data<scalar_t>(), k_,
                         grad_output_n.data<scalar_t>(), k_, 1.0f,
                         grad_weight.data<scalar_t>(), n_);

        // gradient w.r.t. bias: grad_bias += grad_output_n * ones
        THCudaBlas_Sgemv(state,
                         't',
                         k_, m_, 1.0f,
                         grad_output_n.data<scalar_t>(), k_,
                         ones.data<scalar_t>(), 1, 1.0f,
                         grad_bias.data<scalar_t>(), 1);
    }

    return {
        grad_input, grad_offset, grad_mask, grad_weight, grad_bias
    };
}
================================================
FILE: external/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu
================================================
#include "dcn_v2_im2col_cuda.h"
// NOTE(review): the bracketed include targets were lost during extraction;
// restored to match the upstream DCNv2 sources — verify before merging.
#include <cstdio>
#include <algorithm>
#include <cstring>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>

// Grid-stride loop: each thread handles indices i, i + totalThreads, ...
#define CUDA_KERNEL_LOOP(i, n)                          \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
         i < (n);                                       \
         i += blockDim.x * gridDim.x)

const int CUDA_NUM_THREADS = 1024;
// Number of blocks needed to cover N elements at CUDA_NUM_THREADS per block.
inline int GET_BLOCKS(const int N)
{
    return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
// Bilinear interpolation of bottom_data at fractional location (h, w).
// Rows are data_width elements apart; corners that fall outside the
// height x width window contribute zero.
__device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width,
                                      const int height, const int width, float h, float w)
{
    const int h0 = floor(h);
    const int w0 = floor(w);
    const int h1 = h0 + 1;
    const int w1 = w0 + 1;

    // Fractional distances from the top-left corner.
    const float dh = h - h0;
    const float dw = w - w0;

    float acc = 0;
    if (h0 >= 0 && w0 >= 0)
        acc += ((1 - dh) * (1 - dw)) * bottom_data[h0 * data_width + w0];
    if (h0 >= 0 && w1 <= width - 1)
        acc += ((1 - dh) * dw) * bottom_data[h0 * data_width + w1];
    if (h1 <= height - 1 && w0 >= 0)
        acc += (dh * (1 - dw)) * bottom_data[h1 * data_width + w0];
    if (h1 <= height - 1 && w1 <= width - 1)
        acc += (dh * dw) * bottom_data[h1 * data_width + w1];
    return acc;
}
// Returns the bilinear weight that integer pixel (h, w) receives when the
// sampling point is at fractional location (argmax_h, argmax_w); used when
// scattering column gradients back to the input image (col2im). Zero when the
// sampling point is entirely outside the image or (h, w) is not one of its
// four neighboring pixels.
__device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w,
                                          const int h, const int w, const int height, const int width)
{
    if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
    {
        //empty
        return 0;
    }

    int argmax_h_low = floor(argmax_h);
    int argmax_w_low = floor(argmax_w);
    int argmax_h_high = argmax_h_low + 1;
    int argmax_w_high = argmax_w_low + 1;

    float weight = 0;
    // Exactly one of the four cases can match; the weight is the standard
    // bilinear coefficient for that corner.
    if (h == argmax_h_low && w == argmax_w_low)
        weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
    if (h == argmax_h_low && w == argmax_w_high)
        weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
    if (h == argmax_h_high && w == argmax_w_low)
        weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
    if (h == argmax_h_high && w == argmax_w_high)
        weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
    return weight;
}
// Derivative of the bilinear sample at (argmax_h, argmax_w) with respect to
// the sampling coordinate itself: bp_dir == 0 differentiates w.r.t. the h
// coordinate, bp_dir == 1 w.r.t. the w coordinate. Used for the offset
// gradient. Returns 0 when the sampling point lies outside the image.
__device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w,
                                            const int height, const int width, const float *im_data,
                                            const int data_width, const int bp_dir)
{
    if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
    {
        //empty
        return 0;
    }

    int argmax_h_low = floor(argmax_h);
    int argmax_w_low = floor(argmax_w);
    int argmax_h_high = argmax_h_low + 1;
    int argmax_w_high = argmax_w_low + 1;

    float weight = 0;

    if (bp_dir == 0)
    {
        // d(sample)/dh: the h-direction bilinear factors differentiate to
        // -/+ the w-direction factor at each in-bounds corner.
        if (argmax_h_low >= 0 && argmax_w_low >= 0)
            weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
        if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
            weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
        if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
            weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
        if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
            weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
    }
    else if (bp_dir == 1)
    {
        // d(sample)/dw: symmetric to the bp_dir == 0 case.
        if (argmax_h_low >= 0 && argmax_w_low >= 0)
            weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
        if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
            weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
        if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
            weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
        if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
            weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
    }

    return weight;
}
// Unfolds the input into a column buffer for DCNv2: each of the
// channels * batch_size * height_col * width_col threads fills the kernel_h *
// kernel_w column entries for one (sample, channel, output pixel), sampling
// the input bilinearly at the offset-shifted location and scaling by the mask.
// NOTE(review): a static_cast<float> template argument lost during extraction
// has been restored — verify against upstream.
__global__ void modulated_deformable_im2col_gpu_kernel(const int n,
                                                       const float *data_im, const float *data_offset, const float *data_mask,
                                                       const int height, const int width, const int kernel_h, const int kernel_w,
                                                       const int pad_h, const int pad_w,
                                                       const int stride_h, const int stride_w,
                                                       const int dilation_h, const int dilation_w,
                                                       const int channel_per_deformable_group,
                                                       const int batch_size, const int num_channels, const int deformable_group,
                                                       const int height_col, const int width_col,
                                                       float *data_col)
{
    // launch channels * batch_size * height_col * width_col cores
    CUDA_KERNEL_LOOP(index, n)
    {
        // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow)
        // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis
        // Decompose the flat index into (batch, channel, output row, output col).
        const int w_col = index % width_col;
        const int h_col = (index / width_col) % height_col;
        const int b_col = (index / width_col / height_col / num_channels) % batch_size;
        const int c_im = (index / width_col / height_col) % num_channels;
        const int c_col = c_im * kernel_h * kernel_w;

        // compute deformable group index
        const int deformable_group_index = c_im / channel_per_deformable_group;

        // Top-left corner of this output pixel's receptive field in the input.
        const int h_in = h_col * stride_h - pad_h;
        const int w_in = w_col * stride_w - pad_w;

        float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col;
        const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
        const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
        const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;

        for (int i = 0; i < kernel_h; ++i)
        {
            for (int j = 0; j < kernel_w; ++j)
            {
                // Offsets are stored interleaved: channel 2k is dy, 2k+1 is dx.
                const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
                const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
                const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
                const float offset_h = data_offset_ptr[data_offset_h_ptr];
                const float offset_w = data_offset_ptr[data_offset_w_ptr];
                const float mask = data_mask_ptr[data_mask_hw_ptr];
                float val = static_cast<float>(0);
                const float h_im = h_in + i * dilation_h + offset_h;
                const float w_im = w_in + j * dilation_w + offset_w;
                // Sample only when within one pixel of the valid image area;
                // bilinear interpolation zeroes out-of-range corners itself.
                if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
                {
                    val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
                }
                *data_col_ptr = val * mask;
                data_col_ptr += height_col * width_col;
            }
        }
    }
}
// Backward pass w.r.t. the input image for modulated (v2) deformable
// convolution: scatters gradients from the column buffer `data_col` back
// into `grad_im`. One thread per column-buffer element; `index` enumerates
// (c, i, j, b, h_out, w_out) over
// channels * kernel_h * kernel_w * batch_size * height_col * width_col.
__global__ void modulated_deformable_col2im_gpu_kernel(const int n,
    const float *data_col, const float *data_offset, const float *data_mask,
    const int channels, const int height, const int width,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group,
    const int batch_size, const int deformable_group,
    const int height_col, const int width_col,
    float *grad_im)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // Decode the kernel tap (i, j) and input channel c from the flat index.
    const int j = (index / width_col / height_col / batch_size) % kernel_w;
    const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
    // compute the start and end of the output
    const int deformable_group_index = c / channel_per_deformable_group;
    int w_out = index % width_col;
    int h_out = (index / width_col) % height_col;
    int b = (index / width_col / height_col) % batch_size;
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;
    // Offset / mask planes for this sample and deformable group, laid out as
    // (2*kh*kw, height_col, width_col) and (kh*kw, height_col, width_col).
    const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
    const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
    const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
    const float offset_h = data_offset_ptr[data_offset_h_ptr];
    const float offset_w = data_offset_ptr[data_offset_w_ptr];
    const float mask = data_mask_ptr[data_mask_hw_ptr];
    // Fractional sampling position this column element was read from in the
    // forward pass.
    const float cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const float cur_inv_w_data = w_in + j * dilation_w + offset_w;
    // Incoming gradient, modulated by the mask (the forward multiplied by it).
    const float cur_top_grad = data_col[index] * mask;
    const int cur_h = (int)cur_inv_h_data;
    const int cur_w = (int)cur_inv_w_data;
    // Scan a conservative 5x5 window around the truncated position; the
    // distance-< 1 guard restricts accumulation to the integer neighbors
    // actually touched by bilinear sampling (at most four).
    for (int dy = -2; dy <= 2; dy++)
    {
      for (int dx = -2; dx <= 2; dx++)
      {
        if (cur_h + dy >= 0 && cur_h + dy < height &&
            cur_w + dx >= 0 && cur_w + dx < width &&
            abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1)
        {
          int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
          // Many column elements can map to the same input pixel, hence atomic.
          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
        }
      }
    }
  }
}
// Backward pass w.r.t. the learned offsets and modulation masks for
// modulated (v2) deformable convolution. One thread per offset-channel
// element: `index` enumerates (b, c, h, w) over
// batch_size * offset_channels * height_col * width_col, where
// offset_channels = 2 * kernel_h * kernel_w * deformable_group.
__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
    const float *data_col, const float *data_im,
    const float *data_offset, const float *data_mask,
    const int channels, const int height, const int width,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group,
    const int batch_size, const int offset_channels, const int deformable_group,
    const int height_col, const int width_col,
    float *grad_offset, float *grad_mask)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // val accumulates the offset gradient, mval the mask gradient.
    float val = 0, mval = 0;
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;
    // compute the start and end of the output
    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    int cnt = 0;
    const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
    const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
    const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
    const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
    // Offset channel index within this deformable group (pairs of h, w).
    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
    // Accumulate over all input channels of this group that share this
    // (i, j, h, w) offset; col_step jumps between channels in the column buffer.
    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
    {
      const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      // bp_dir selects the differentiation direction: 0 = h-offset, 1 = w-offset.
      const int bp_dir = offset_c % 2;
      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
      const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
      const float offset_h = data_offset_ptr[data_offset_h_ptr];
      const float offset_w = data_offset_ptr[data_offset_w_ptr];
      const float mask = data_mask_ptr[data_mask_hw_ptr];
      float inv_h = h_in + i * dilation_h + offset_h;
      float inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
      {
        // Out-of-bounds sample: sentinel fed to dmcn_get_coordinate_weight
        // below (presumably yields zero weight — confirm in its definition).
        inv_h = inv_w = -2;
      }
      else
      {
        // In-bounds sample contributes to the mask gradient: d(out)/d(mask)
        // is the bilinearly sampled input value times the column gradient.
        mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
      }
      const float weight = dmcn_get_coordinate_weight(
          inv_h, inv_w,
          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
      val += weight * data_col_ptr[col_pos] * mask;
      cnt += 1;
    }
    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
    grad_offset[index] = val;
    // The mask gradient is written once per kernel tap, keyed off the even
    // (h-direction) offset channel of each (h, w) pair.
    if (offset_c % 2 == 0)
      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
  }
}
// Host launcher for the modulated deformable im2col kernel: expands the
// input image into the column buffer `data_col` using the learned offsets
// and masks. The kernel launch configuration (`<<<...>>>`) had been lost in
// this copy and is restored here.
void modulated_deformable_im2col_cuda(cudaStream_t stream,
    const float* data_im, const float* data_offset, const float* data_mask,
    const int batch_size, const int channels, const int height_im, const int width_im,
    const int height_col, const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int deformable_group, float* data_col) {
  // num_axes should be smaller than block size
  const int channel_per_deformable_group = channels / deformable_group;
  // One thread per (channel, batch, output pixel) element.
  const int num_kernels = channels * batch_size * height_col * width_col;
  modulated_deformable_im2col_gpu_kernel
      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
          num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w,
          pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
          batch_size, channels, deformable_group, height_col, width_col, data_col);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
  }
}
// Host launcher for the col2im (input-gradient) kernel. Two fixes relative
// to the previous copy: the lost kernel launch configuration (`<<<...>>>`)
// is restored, and the argument list passed `pad_h` twice where the kernel
// expects (pad_h, pad_w) — asymmetric padding would have used the wrong
// width padding.
void modulated_deformable_col2im_cuda(cudaStream_t stream,
    const float* data_col, const float* data_offset, const float* data_mask,
    const int batch_size, const int channels, const int height_im, const int width_im,
    const int height_col, const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int deformable_group, float* grad_im){
  const int channel_per_deformable_group = channels / deformable_group;
  // One thread per column-buffer element.
  const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
  modulated_deformable_col2im_gpu_kernel
      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
          num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im,
          kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,  // was: pad_h, pad_h
          dilation_h, dilation_w, channel_per_deformable_group,
          batch_size, deformable_group, height_col, width_col, grad_im);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
  }
}
// Host launcher for the offset/mask-gradient kernel. The lost kernel launch
// configuration (`<<<...>>>`) is restored here.
void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
    const float* data_col, const float* data_im, const float* data_offset, const float* data_mask,
    const int batch_size, const int channels, const int height_im, const int width_im,
    const int height_col, const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int deformable_group,
    float* grad_offset, float* grad_mask) {
  // One thread per offset-channel element (2 offsets per kernel tap per group).
  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
  const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
  modulated_deformable_col2im_coord_gpu_kernel
      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
          num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im,
          kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
          dilation_h, dilation_w, channel_per_deformable_group,
          batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col,
          grad_offset, grad_mask);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err));
  }
}
================================================
FILE: external/DCNv2/src/cuda/dcn_v2_im2col_cuda.h
================================================
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer ********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.h
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1811.11168
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu
*/
/***************** Adapted by Charles Shang *********************/
#ifndef DCN_V2_IM2COL_CUDA
#define DCN_V2_IM2COL_CUDA

#ifdef __cplusplus
extern "C"
{
#endif

  // im2col for modulated deformable convolution: expands `data_im` into the
  // column buffer `data_col` using learned per-position offsets and masks.
  // (Parameter name typo `kenerl_w` fixed to `kernel_w` to match the
  // definitions in dcn_v2_im2col_cuda.cu.)
  void modulated_deformable_im2col_cuda(cudaStream_t stream,
                                        const float *data_im, const float *data_offset, const float *data_mask,
                                        const int batch_size, const int channels, const int height_im, const int width_im,
                                        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
                                        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
                                        const int dilation_h, const int dilation_w,
                                        const int deformable_group, float *data_col);

  // col2im: scatters gradients from the column buffer back to the input image.
  void modulated_deformable_col2im_cuda(cudaStream_t stream,
                                        const float *data_col, const float *data_offset, const float *data_mask,
                                        const int batch_size, const int channels, const int height_im, const int width_im,
                                        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
                                        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
                                        const int dilation_h, const int dilation_w,
                                        const int deformable_group, float *grad_im);

  // Gradients w.r.t. the learned offsets and modulation masks.
  void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
                                              const float *data_col, const float *data_im, const float *data_offset, const float *data_mask,
                                              const int batch_size, const int channels, const int height_im, const int width_im,
                                              const int height_col, const int width_col, const int kernel_h, const int kernel_w,
                                              const int pad_h, const int pad_w, const int stride_h, const int stride_w,
                                              const int dilation_h, const int dilation_w,
                                              const int deformable_group,
                                              float *grad_offset, float *grad_mask);

#ifdef __cplusplus
}
#endif

#endif
================================================
FILE: external/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu
================================================
/*!
* Copyright (c) 2017 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file deformable_psroi_pooling.cu
* \brief
* \author Yi Li, Guodong Zhang, Jifeng Dai
*/
/***************** Adapted by Charles Shang *********************/
#include <cstdio>
#include <algorithm>
#include <cstring>
#include <vector>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
// Grid-stride loop: each thread starts at its global id and advances by the
// total number of launched threads until it covers all n elements.
#define CUDA_KERNEL_LOOP(i, n)                        \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n);                                       \
       i += blockDim.x * gridDim.x)

const int CUDA_NUM_THREADS = 1024;

// Number of thread blocks needed to cover N elements at CUDA_NUM_THREADS
// threads per block (ceiling division).
inline int GET_BLOCKS(const int N)
{
  return N / CUDA_NUM_THREADS + (N % CUDA_NUM_THREADS != 0 ? 1 : 0);
}
// Bilinearly interpolates a single (height x width, row-major) image plane
// `data` at the fractional coordinate (x, y): the four surrounding texels
// are blended by their distance to the sample point. Callers are expected
// to pass in-bounds coordinates so floor/ceil neighbors are valid.
// (The stripped `template <typename T>` header and `static_cast<T>` template
// arguments are restored here; without them this did not compile.)
template <typename T>
__device__ T bilinear_interp(
    const T *data,
    const T x,
    const T y,
    const int width,
    const int height)
{
  int x1 = floor(x);
  int x2 = ceil(x);
  int y1 = floor(y);
  int y2 = ceil(y);
  T dist_x = static_cast<T>(x - x1);
  T dist_y = static_cast<T>(y - y1);
  T value11 = data[y1 * width + x1];
  T value12 = data[y2 * width + x1];
  T value21 = data[y1 * width + x2];
  T value22 = data[y2 * width + x2];
  T value = (1 - dist_x) * (1 - dist_y) * value11 +
            (1 - dist_x) * dist_y * value12 +
            dist_x * (1 - dist_y) * value21 +
            dist_x * dist_y * value22;
  return value;
}
// Forward kernel for deformable position-sensitive ROI pooling. Each thread
// computes one output element (n, ctop, ph, pw): the average of up to
// sample_per_part^2 bilinear samples taken inside the (optionally
// translated) pooling bin. `top_count` records how many samples landed
// in-bounds so the backward pass can normalize identically.
// (The stripped `template <typename T>` header and `static_cast<T>` template
// arguments are restored here.)
template <typename T>
__global__ void DeformablePSROIPoolForwardKernel(
    const int count,
    const T *bottom_data,
    const T spatial_scale,
    const int channels,
    const int height, const int width,
    const int pooled_height, const int pooled_width,
    const T *bottom_rois, const T *bottom_trans,
    const int no_trans,
    const T trans_std,
    const int sample_per_part,
    const int output_dim,
    const int group_size,
    const int part_size,
    const int num_classes,
    const int channels_each_class,
    T *top_data,
    T *top_count)
{
  CUDA_KERNEL_LOOP(index, count)
  {
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int ctop = (index / pooled_width / pooled_height) % output_dim;
    int n = index / pooled_width / pooled_height / output_dim;
    // [start, end) interval for spatial sampling; ROI format is
    // (batch_index, x1, y1, x2, y2).
    const T *offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
    T roi_start_h = static_cast<T>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
    T roi_end_w = static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
    T roi_end_h = static_cast<T>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
    // Force too small ROIs to be 1x1
    T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
    T roi_height = max(roi_end_h - roi_start_h, 0.1);
    // Compute w and h at bottom
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);
    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
    // Part index used to look up the learned (x, y) bin translation.
    int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
    int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
    int class_id = ctop / channels_each_class;
    T trans_x = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
    T trans_y = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
    wstart += trans_x * roi_width;
    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
    hstart += trans_y * roi_height;
    T sum = 0;
    // NOTE: shadows the kernel argument `count`; counts in-bounds samples.
    int count = 0;
    int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
    int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
    gw = min(max(gw, 0), group_size - 1);
    gh = min(max(gh, 0), group_size - 1);
    const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
    for (int ih = 0; ih < sample_per_part; ih++)
    {
      for (int iw = 0; iw < sample_per_part; iw++)
      {
        T w = wstart + iw * sub_bin_size_w;
        T h = hstart + ih * sub_bin_size_h;
        // bilinear interpolation; skip samples falling outside the image.
        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
        {
          continue;
        }
        w = min(max(w, 0.), width - 1.);
        h = min(max(h, 0.), height - 1.);
        // Position-sensitive channel for this bin of this output channel.
        int c = (ctop * group_size + gh) * group_size + gw;
        T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
        sum += val;
        count++;
      }
    }
    // Average of in-bounds samples; 0 if every sample fell outside.
    top_data[index] = count == 0 ? static_cast<T>(0) : sum / count;
    top_count[index] = count;
  }
}
// Backward kernel for deformable position-sensitive ROI pooling. Each thread
// redistributes one output element's gradient back onto the four bilinear
// neighbors of every sample it averaged in the forward pass (accumulated
// with atomicAdd), and — unless no_trans — also accumulates gradients for
// the learned bin translations.
// (The stripped `template <typename T>` header and `static_cast<T>` template
// arguments are restored here.)
template <typename T>
__global__ void DeformablePSROIPoolBackwardAccKernel(
    const int count,
    const T *top_diff,
    const T *top_count,
    const int num_rois,
    const T spatial_scale,
    const int channels,
    const int height, const int width,
    const int pooled_height, const int pooled_width,
    const int output_dim,
    T *bottom_data_diff, T *bottom_trans_diff,
    const T *bottom_data,
    const T *bottom_rois,
    const T *bottom_trans,
    const int no_trans,
    const T trans_std,
    const int sample_per_part,
    const int group_size,
    const int part_size,
    const int num_classes,
    const int channels_each_class)
{
  CUDA_KERNEL_LOOP(index, count)
  {
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int ctop = (index / pooled_width / pooled_height) % output_dim;
    int n = index / pooled_width / pooled_height / output_dim;
    // [start, end) interval for spatial sampling — must mirror the forward
    // kernel exactly so gradients land on the same samples.
    const T *offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
    T roi_start_h = static_cast<T>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
    T roi_end_w = static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
    T roi_end_h = static_cast<T>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
    // Force too small ROIs to be 1x1
    T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
    T roi_height = max(roi_end_h - roi_start_h, 0.1);
    // Compute w and h at bottom
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);
    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
    int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
    int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
    int class_id = ctop / channels_each_class;
    T trans_x = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
    T trans_y = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
    wstart += trans_x * roi_width;
    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
    hstart += trans_y * roi_height;
    // Forward produced no in-bounds samples for this bin: nothing to do.
    if (top_count[index] <= 0)
    {
      continue;
    }
    // Forward averaged over top_count samples, so divide the gradient.
    T diff_val = top_diff[index] / top_count[index];
    const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
    T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
    int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
    int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
    gw = min(max(gw, 0), group_size - 1);
    gh = min(max(gh, 0), group_size - 1);
    for (int ih = 0; ih < sample_per_part; ih++)
    {
      for (int iw = 0; iw < sample_per_part; iw++)
      {
        T w = wstart + iw * sub_bin_size_w;
        T h = hstart + ih * sub_bin_size_h;
        // bilinear interpolation; skip samples the forward pass skipped.
        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
        {
          continue;
        }
        w = min(max(w, 0.), width - 1.);
        h = min(max(h, 0.), height - 1.);
        int c = (ctop * group_size + gh) * group_size + gw;
        // backward on feature: split diff_val across the 4 bilinear neighbors.
        int x0 = floor(w);
        int x1 = ceil(w);
        int y0 = floor(h);
        int y1 = ceil(h);
        T dist_x = w - x0, dist_y = h - y0;
        T q00 = (1 - dist_x) * (1 - dist_y);
        T q01 = (1 - dist_x) * dist_y;
        T q10 = dist_x * (1 - dist_y);
        T q11 = dist_x * dist_y;
        int bottom_index_base = c * height * width;
        atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
        atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
        atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
        atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
        if (no_trans)
        {
          continue;
        }
        // Gradient w.r.t. the learned translation: derivative of the
        // bilinear sample w.r.t. (x, y), scaled by trans_std and ROI size.
        T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
        T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
        T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
        T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
        T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
        diff_x *= roi_width;
        T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
        diff_y *= roi_height;
        atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
        atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
      }
    }
  }
}
// Host entry for deformable PSROI pooling forward. Validates inputs, sizes
// the output, and launches DeformablePSROIPoolForwardKernel for each
// floating dtype. Returns (pooled output, per-bin sample counts); the
// counts are consumed by the backward pass.
// (Restored here: the stripped `std::tuple<...>` return type, the
// `.data<scalar_t>()` template arguments, the explicit kernel template
// argument, and the `<<<grid, block, 0, stream>>>` launch configuration.)
std::tuple<at::Tensor, at::Tensor>
dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input,
                                  const at::Tensor &bbox,
                                  const at::Tensor &trans,
                                  const int no_trans,
                                  const float spatial_scale,
                                  const int output_dim,
                                  const int group_size,
                                  const int pooled_size,
                                  const int part_size,
                                  const int sample_per_part,
                                  const float trans_std)
{
  AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
  AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor");
  AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");
  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);
  const int channels_trans = no_trans ? 2 : trans.size(1);
  const int num_bbox = bbox.size(0);
  AT_ASSERTM(channels == output_dim, "input channels and output channels must equal");
  auto pooled_height = pooled_size;
  auto pooled_width = pooled_size;
  auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options());
  long out_size = num_bbox * output_dim * pooled_height * pooled_width;
  auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options());
  const int num_classes = no_trans ? 1 : channels_trans / 2;
  const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // Nothing to pool (e.g. zero ROIs): return empty results without launching.
  if (out.numel() == 0)
  {
    THCudaCheck(cudaGetLastError());
    return std::make_tuple(out, top_count);
  }
  dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L));
  dim3 block(512);
  AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] {
    DeformablePSROIPoolForwardKernel<scalar_t><<<grid, block, 0, stream>>>(
        out_size,
        input.contiguous().data<scalar_t>(),
        spatial_scale,
        channels,
        height, width,
        pooled_height,
        pooled_width,
        bbox.contiguous().data<scalar_t>(),
        trans.contiguous().data<scalar_t>(),
        no_trans,
        trans_std,
        sample_per_part,
        output_dim,
        group_size,
        part_size,
        num_classes,
        channels_each_class,
        out.data<scalar_t>(),
        top_count.data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());
  return std::make_tuple(out, top_count);
}
// Host entry for deformable PSROI pooling backward. Allocates zeroed
// gradient buffers for the input features and the translations, then
// launches DeformablePSROIPoolBackwardAccKernel, which accumulates into
// them with atomics. Returns (input gradient, translation gradient).
// (Restored here: the stripped `std::tuple<...>` return type, the
// `.data<scalar_t>()` template arguments, the explicit kernel template
// argument, and the `<<<grid, block, 0, stream>>>` launch configuration.)
std::tuple<at::Tensor, at::Tensor>
dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad,
                                   const at::Tensor &input,
                                   const at::Tensor &bbox,
                                   const at::Tensor &trans,
                                   const at::Tensor &top_count,
                                   const int no_trans,
                                   const float spatial_scale,
                                   const int output_dim,
                                   const int group_size,
                                   const int pooled_size,
                                   const int part_size,
                                   const int sample_per_part,
                                   const float trans_std)
{
  AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor");
  AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
  AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor");
  AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");
  AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");
  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);
  const int channels_trans = no_trans ? 2 : trans.size(1);
  const int num_bbox = bbox.size(0);
  AT_ASSERTM(channels == output_dim, "input channels and output channels must equal");
  auto pooled_height = pooled_size;
  auto pooled_width = pooled_size;
  long out_size = num_bbox * output_dim * pooled_height * pooled_width;
  const int num_classes = no_trans ? 1 : channels_trans / 2;
  const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
  auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options());
  auto trans_grad = at::zeros_like(trans);
  // Nothing to backprop into: return the zeroed buffers without launching.
  if (input_grad.numel() == 0)
  {
    THCudaCheck(cudaGetLastError());
    return std::make_tuple(input_grad, trans_grad);
  }
  dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L));
  dim3 block(512);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] {
    DeformablePSROIPoolBackwardAccKernel<scalar_t><<<grid, block, 0, stream>>>(
        out_size,
        out_grad.contiguous().data<scalar_t>(),
        top_count.contiguous().data<scalar_t>(),
        num_bbox,
        spatial_scale,
        channels,
        height,
        width,
        pooled_height,
        pooled_width,
        output_dim,
        input_grad.contiguous().data<scalar_t>(),
        trans_grad.contiguous().data<scalar_t>(),
        input.contiguous().data<scalar_t>(),
        bbox.contiguous().data<scalar_t>(),
        trans.contiguous().data<scalar_t>(),
        no_trans,
        trans_std,
        sample_per_part,
        group_size,
        part_size,
        num_classes,
        channels_each_class);
  });
  THCudaCheck(cudaGetLastError());
  return std::make_tuple(input_grad, trans_grad);
}
================================================
FILE: external/DCNv2/src/cuda/vision.h
================================================
#pragma once
#include
at::Tensor
dcn_v2_cuda_forward(const at::Tensor &input,
const at::Tensor &weight,
const at::Tensor &bias,
const at::Tensor &offset,
const at::Tensor &mask,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const int deformable_group);
std::vector
dcn_v2_cuda_backward(const at::Tensor &input,
const at::Tensor &weight,
const at::Tensor &bias,
const at::Tensor &offset,
const at::Tensor &mask,
const at::Tensor &grad_output,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int pad_h, int pad_w,
int dilation_h, int dilation_w,
int deformable_group);
std::tuple
dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input,
const at::Tensor &bbox,
const at::Tensor &trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std);
std::tuple
dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad,
const at::Tensor &input,
const at::Tensor &bbox,
const at::Tensor &trans,
const at::Tensor &top_count,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std);
================================================
FILE: external/DCNv2/src/dcn_v2.h
================================================
#pragma once
#include "cpu/vision.h"
#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif
// Device-dispatching wrapper for the DCNv2 forward pass: only the CUDA
// backend exists, so reject CPU tensors up front and forward everything
// else to dcn_v2_cuda_forward (when compiled with GPU support).
at::Tensor
dcn_v2_forward(const at::Tensor &input,
               const at::Tensor &weight,
               const at::Tensor &bias,
               const at::Tensor &offset,
               const at::Tensor &mask,
               const int kernel_h,
               const int kernel_w,
               const int stride_h,
               const int stride_w,
               const int pad_h,
               const int pad_w,
               const int dilation_h,
               const int dilation_w,
               const int deformable_group)
{
    if (!input.type().is_cuda())
    {
        AT_ERROR("Not implemented on the CPU");
    }
#ifdef WITH_CUDA
    return dcn_v2_cuda_forward(input, weight, bias, offset, mask,
                               kernel_h, kernel_w,
                               stride_h, stride_w,
                               pad_h, pad_w,
                               dilation_h, dilation_w,
                               deformable_group);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
}
std::vector
dcn_v2_backward(const at::Tensor &input,
const at::Tensor &weight,
const at::Tensor &bias,
const at::Tensor &offset,
const at::Tensor &mask,
const at::Tensor &grad_output,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int pad_h, int pad_w,
int dilation_h, int dilation_w,
int deformable_group)
{
if (input.type().is_cuda())
{
#ifdef WITH_CUDA
return dcn_v2_cuda_backward(input,
weight,
bias,
offset,
mask,
grad_output,
kernel_h, kernel_w,
stride_h, stride_w,
pad_h, pad_w,
dilation_h, dilation_w,
deformable_group);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
std::tuple
dcn_v2_psroi_pooling_forward(const at::Tensor &input,
const at::Tensor &bbox,
const at::Tensor &trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
if (input.type().is_cuda())
{
#ifdef WITH_CUDA
return dcn_v2_psroi_pooling_cuda_forward(input,
bbox,
trans,
no_trans,
spatial_scale,
output_dim,
group_size,
pooled_size,
part_size,
sample_per_part,
trans_std);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
std::tuple
dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad,
const at::Tensor &input,
const at::Tensor &bbox,
const at::Tensor &trans,
const at::Tensor &top_count,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
if (input.type().is_cuda())
{
#ifdef WITH_CUDA
return dcn_v2_psroi_pooling_cuda_backward(out_grad,
input,
bbox,
trans,
top_count,
no_trans,
spatial_scale,
output_dim,
group_size,
pooled_size,
part_size,
sample_per_part,
trans_std);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
================================================
FILE: external/DCNv2/src/vision.cpp
================================================
#include "dcn_v2.h"
// Python bindings: exposes the DCNv2 convolution and deformable PSROI
// pooling entry points declared in dcn_v2.h under the extension module name
// supplied by the build (TORCH_EXTENSION_NAME).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward");
  m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward");
  m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward");
  m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward");
}
================================================
FILE: external/DCNv2/test.py
================================================
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import time
import torch
import torch.nn as nn
from torch.autograd import gradcheck
from dcn_v2 import dcn_v2_conv, DCNv2, DCN
from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling
deformable_groups = 1
N, inC, inH, inW = 2, 2, 4, 4
outC = 2
kH, kW = 3, 3
def conv_identify(weight, bias):
    """Overwrite a conv layer's parameters in-place so it acts as identity.

    All weights and the bias are zeroed, then a 1 is placed at the spatial
    center of each kernel whose input channel index equals its output
    channel index.
    """
    out_ch, in_ch, kh, kw = weight.shape
    cy, cx = kh // 2, kw // 2
    weight.data.zero_()
    bias.data.zero_()
    # Diagonal channels only (up to the smaller channel count).
    for c in range(min(out_ch, in_ch)):
        weight.data[c, c, cy, cx] = 1.0
def check_zero_offset():
    """With zero offsets/masks and identity weights, DCNv2 must reproduce
    its input (up to the 2x factor that undoes the sigmoid(0)=0.5 mask)."""
    conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW,
                            kernel_size=(kH, kW), stride=(1, 1),
                            padding=(1, 1), bias=True).cuda()
    conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW,
                          kernel_size=(kH, kW), stride=(1, 1),
                          padding=(1, 1), bias=True).cuda()
    dcn_v2 = DCNv2(inC, outC, (kH, kW),
                   stride=1, padding=1, dilation=1,
                   deformable_groups=deformable_groups).cuda()

    # Zero the offset/mask predictors so the deformable conv samples the
    # regular grid, and make the DCN weights an identity mapping.
    for layer in (conv_offset, conv_mask):
        layer.weight.data.zero_()
        layer.bias.data.zero_()
    conv_identify(dcn_v2.weight, dcn_v2.bias)

    input = torch.randn(N, inC, inH, inW).cuda()
    offset = conv_offset(input)
    mask = torch.sigmoid(conv_mask(input))
    output = dcn_v2(input, offset, mask)
    # sigmoid(0) == 0.5 scales every sampled value, so undo it.
    output *= 2
    d = (input - output).abs().max()
    if d < 1e-10:
        print('Zero offset passed')
    else:
        print('Zero offset failed')
        print(input)
        print(output)
def check_gradient_dconv():
    """Numerically check gradients of the raw dcn_v2_conv function on a
    small random problem (runs on CUDA)."""
    input = torch.rand(N, inC, inH, inW).cuda() * 0.01
    input.requires_grad = True

    offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2
    offset.requires_grad = True

    mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW).cuda()
    mask.requires_grad = True
    mask = torch.sigmoid(mask)

    weight = torch.randn(outC, inC, kH, kW).cuda()
    weight.requires_grad = True
    bias = torch.rand(outC).cuda()
    bias.requires_grad = True

    stride, padding, dilation = 1, 1, 1
    print('check_gradient_dconv: ',
          gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias,
                                  stride, padding, dilation, deformable_groups),
                    eps=1e-3, atol=1e-4, rtol=1e-2))
def check_pooling_zero_offset():
    """Sanity-check DCNv2Pooling on two synthetic box targets: report the
    per-ROI mean response with and without (zeroed) trans offsets."""
    input = torch.randn(2, 16, 64, 64).cuda().zero_()
    input[0, :, 16:26, 16:26] = 1.
    input[1, :, 10:20, 20:30] = 2.
    rois = torch.tensor([
        [0, 65, 65, 103, 103],
        [1, 81, 41, 119, 79],
    ]).cuda().float()

    def make_pooling(no_trans):
        # Same hyper-parameters for both variants; only no_trans differs.
        return DCNv2Pooling(spatial_scale=1.0 / 4,
                            pooled_size=7,
                            output_dim=16,
                            no_trans=no_trans,
                            group_size=1,
                            trans_std=0.0).cuda()

    # Plain (non-deformable) pooling: empty offset tensor.
    pooling = make_pooling(True)
    out = pooling(input, rois, input.new())
    print(', '.join(['%f' % out[i, :, :, :].mean().item()
                     for i in range(rois.shape[0])]))

    # Deformable pooling with an all-zero offset should match.
    dpooling = make_pooling(False)
    offset = torch.randn(20, 2, 7, 7).cuda().zero_()
    dout = dpooling(input, rois, offset)
    print(', '.join(['%f' % dout[i, :, :, :].mean().item()
                     for i in range(rois.shape[0])]))
def check_gradient_dpooling():
    """Numerically check gradients of the raw dcn_v2_pooling function on a
    few random ROIs (runs on CUDA)."""
    input = torch.randn(2, 3, 5, 5).cuda() * 0.01
    input.requires_grad = True

    n_rois = 4
    batch_inds = torch.randint(2, (n_rois, 1)).cuda().float()
    x = torch.rand((n_rois, 1)).cuda().float() * 15
    y = torch.rand((n_rois, 1)).cuda().float() * 15
    w = torch.rand((n_rois, 1)).cuda().float() * 10
    h = torch.rand((n_rois, 1)).cuda().float() * 10
    rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)

    offset = torch.randn(n_rois, 2, 3, 3).cuda()
    offset.requires_grad = True

    pooled_size = 3
    print('check_gradient_dpooling:',
          gradcheck(dcn_v2_pooling, (input, rois, offset,
                                     1.0 / 4,      # spatial_scale
                                     pooled_size,
                                     3,            # output_dim
                                     0,            # no_trans
                                     1,            # group_size
                                     pooled_size,  # part_size
                                     4,            # sample_per_part
                                     0.0),         # trans_std
                    eps=1e-4))
def example_dconv():
    """End-to-end DCN example: forward a random batch and backprop a
    random-target loss through the wrapped deformable conv."""
    input = torch.randn(2, 64, 128, 128).cuda()
    # DCN bundles the offset and mask predictors internally.
    dcn = DCN(64, 64, kernel_size=(3, 3), stride=1,
              padding=1, deformable_groups=2).cuda()
    output = dcn(input)
    target = output.new(*output.size())
    target.data.uniform_(-0.01, 0.01)
    (target - output).mean().backward()
    print(output.shape)
def example_dpooling():
    """Compare plain ROI pooling against deformable pooling on random ROIs
    and backprop a random-target loss through each."""
    input = torch.randn(2, 32, 64, 64).cuda()
    batch_inds = torch.randint(2, (20, 1)).cuda().float()
    x = torch.randint(256, (20, 1)).cuda().float()
    y = torch.randint(256, (20, 1)).cuda().float()
    w = torch.randint(64, (20, 1)).cuda().float()
    h = torch.randint(64, (20, 1)).cuda().float()
    rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
    offset = torch.randn(20, 2, 7, 7).cuda()
    input.requires_grad = True
    offset.requires_grad = True

    # no_trans=True behaves like a normal roi_align.
    pooling = DCNv2Pooling(spatial_scale=1.0 / 4, pooled_size=7,
                           output_dim=32, no_trans=True,
                           group_size=1, trans_std=0.1).cuda()
    # no_trans=False applies the supplied per-bin offsets.
    dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, pooled_size=7,
                            output_dim=32, no_trans=False,
                            group_size=1, trans_std=0.1).cuda()

    out = pooling(input, rois, offset)
    dout = dpooling(input, rois, offset)
    print(out.shape)
    print(dout.shape)

    # Backprop a random-target loss through each result in turn.
    for result in (out, dout):
        target = result.new(*result.size())
        target.data.uniform_(-0.01, 0.01)
        (target - result).mean().backward()
def example_mdpooling():
    """Example of the DCNPooling wrapper (modulated deformable pooling),
    which predicts its own offsets via an internal FC head."""
    input = torch.randn(2, 32, 64, 64).cuda()
    input.requires_grad = True
    batch_inds = torch.randint(2, (20, 1)).cuda().float()
    x = torch.randint(256, (20, 1)).cuda().float()
    y = torch.randint(256, (20, 1)).cuda().float()
    w = torch.randint(64, (20, 1)).cuda().float()
    h = torch.randint(64, (20, 1)).cuda().float()
    rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)

    dpooling = DCNPooling(spatial_scale=1.0 / 4,
                          pooled_size=7,
                          output_dim=32,
                          no_trans=False,
                          group_size=1,
                          trans_std=0.1,
                          deform_fc_dim=1024).cuda()

    dout = dpooling(input, rois)
    target = dout.new(*dout.size())
    target.data.uniform_(-0.1, 0.1)
    (target - dout).mean().backward()
    print(dout.shape)
if __name__ == '__main__':
    # Smoke-test the compiled CUDA extension end-to-end: forward/backward
    # examples first, then the numerical checks.
    example_dconv()
    example_dpooling()
    example_mdpooling()
    check_pooling_zero_offset()
    # zero offset check
    # (identity weights only make sense when input/output channel counts match)
    if inC == outC:
        check_zero_offset()
    check_gradient_dpooling()
    check_gradient_dconv()
    # """
    # ****** Note: backward is not reentrant error may not be a serious problem,
    # ****** since the max error is less than 1e-7,
    # ****** Still looking for what trigger this problem
    # """
================================================
FILE: layers/__init__.py
================================================
from .functions import *
from .modules import *
================================================
FILE: layers/box_utils.py
================================================
# -*- coding: utf-8 -*-
import torch
from utils import timer
from data import cfg
@torch.jit.script
def point_form(boxes):
    """Convert boxes from center-size (cx, cy, w, h) to corner form
    (xmin, ymin, xmax, ymax).

    Args:
        boxes: (tensor) center-size boxes, shape [n, 4].
    Return:
        (tensor) the same boxes in corner form, shape [n, 4].
    """
    centers = boxes[:, :2]
    half = boxes[:, 2:] / 2
    return torch.cat((centers - half, centers + half), 1)
@torch.jit.script
def center_size(boxes):
    """Convert boxes from corner form (xmin, ymin, xmax, ymax) to
    center-size form (cx, cy, w, h).

    Args:
        boxes: (tensor) point-form boxes, shape [n, 4].
    Return:
        (tensor) the same boxes in center-size form, shape [n, 4].
    """
    mins = boxes[:, :2]
    maxs = boxes[:, 2:]
    return torch.cat(((maxs + mins) / 2, maxs - mins), 1)
@torch.jit.script
def intersect(box_a, box_b):
    """Pairwise intersection areas between two batched sets of boxes.

    Expands [n,A,4] and [n,B,4] to a shared [n,A,B,2] view (no extra
    allocation for the expand itself) and clamps negative overlaps to zero.

    Args:
        box_a: (tensor) boxes in point form, shape [n, A, 4].
        box_b: (tensor) boxes in point form, shape [n, B, 4].
    Return:
        (tensor) intersection areas, shape [n, A, B].
    """
    n = box_a.size(0)
    A = box_a.size(1)
    B = box_b.size(1)
    lo = torch.max(box_a[:, :, :2].unsqueeze(2).expand(n, A, B, 2),
                   box_b[:, :, :2].unsqueeze(1).expand(n, A, B, 2))
    hi = torch.min(box_a[:, :, 2:].unsqueeze(2).expand(n, A, B, 2),
                   box_b[:, :, 2:].unsqueeze(1).expand(n, A, B, 2))
    wh = (hi - lo).clamp(min=0)
    return wh[:, :, :, 0] * wh[:, :, :, 1]
def jaccard(box_a, box_b, iscrowd:bool=False):
    """Pairwise IoU (jaccard overlap) between two sets of point-form boxes.

    A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)

    If iscrowd is True, the intersection is normalized by area(box_a) only
    (the COCO crowd convention); put the crowd boxes in box_b.

    Args:
        box_a: (tensor) Ground truth boxes, shape [A, 4] or [n, A, 4].
        box_b: (tensor) Prior boxes, shape [B, 4] or [n, B, 4].
    Return:
        (tensor) overlaps, shape [A, B] (or [n, A, B] for batched input).
    """
    batched = box_a.dim() != 2
    if not batched:
        box_a = box_a[None, ...]
        box_b = box_b[None, ...]
    inter = intersect(box_a, box_b)
    w_a = box_a[:, :, 2] - box_a[:, :, 0]
    h_a = box_a[:, :, 3] - box_a[:, :, 1]
    w_b = box_b[:, :, 2] - box_b[:, :, 0]
    h_b = box_b[:, :, 3] - box_b[:, :, 1]
    area_a = (w_a * h_a).unsqueeze(2).expand_as(inter)
    area_b = (w_b * h_b).unsqueeze(1).expand_as(inter)
    if iscrowd:
        out = inter / area_a
    else:
        out = inter / (area_a + area_b - inter)
    return out if batched else out.squeeze(0)
def diou(box_a, box_b, iscrowd:bool=False):
    """Pairwise DIoU-style overlap between two sets of point-form boxes.

    Returns IoU minus a normalized center-distance penalty raised to the
    power 0.9 (as used by Cluster-DIoU-NMS). With iscrowd=True the crowd
    convention inter/area(box_a) is returned instead, with no penalty.

    Args:
        box_a: (tensor) boxes, shape [A, 4] or [n, A, 4].
        box_b: (tensor) boxes, shape [B, 4] or [n, B, 4].
    Return:
        (tensor) pairwise scores, shape [A, B] (or [n, A, B] for batched input).
    """
    batched = box_a.dim() != 2
    if not batched:
        box_a = box_a[None, ...]
        box_b = box_b[None, ...]
    inter = intersect(box_a, box_b)
    area_a = ((box_a[:, :, 2] - box_a[:, :, 0]) *
              (box_a[:, :, 3] - box_a[:, :, 1])).unsqueeze(2).expand_as(inter)
    area_b = ((box_b[:, :, 2] - box_b[:, :, 0]) *
              (box_b[:, :, 3] - box_b[:, :, 1])).unsqueeze(1).expand_as(inter)
    union = area_a + area_b - inter

    # Box centers, broadcast to the pairwise shape of `inter`.
    x1 = ((box_a[:, :, 2] + box_a[:, :, 0]) / 2).unsqueeze(2).expand_as(inter)
    y1 = ((box_a[:, :, 3] + box_a[:, :, 1]) / 2).unsqueeze(2).expand_as(inter)
    x2 = ((box_b[:, :, 2] + box_b[:, :, 0]) / 2).unsqueeze(1).expand_as(inter)
    y2 = ((box_b[:, :, 3] + box_b[:, :, 1]) / 2).unsqueeze(1).expand_as(inter)

    # Smallest enclosing box of each pair.
    cl = torch.min(box_a[:, :, 0].unsqueeze(2).expand_as(inter),
                   box_b[:, :, 0].unsqueeze(1).expand_as(inter))
    ct = torch.min(box_a[:, :, 1].unsqueeze(2).expand_as(inter),
                   box_b[:, :, 1].unsqueeze(1).expand_as(inter))
    cr = torch.max(box_a[:, :, 2].unsqueeze(2).expand_as(inter),
                   box_b[:, :, 2].unsqueeze(1).expand_as(inter))
    cb = torch.max(box_a[:, :, 3].unsqueeze(2).expand_as(inter),
                   box_b[:, :, 3].unsqueeze(1).expand_as(inter))

    # Squared center distance over squared enclosing-box diagonal.
    D = ((x2 - x1) ** 2 + (y2 - y1) ** 2) / ((cr - cl) ** 2 + (cb - ct) ** 2 + 1e-7)
    out = inter / area_a if iscrowd else inter / union - D ** 0.9
    return out if batched else out.squeeze(0)
def distance(box_a, box_b, iscrowd:bool=False):
    """Pairwise normalized center-distance penalty between point-form boxes.

    Computes (d^2 / c^2) ** 0.6 where d is the distance between the two box
    centers and c is the diagonal of their smallest enclosing box (the DIoU
    penalty term). The iscrowd flag currently has no effect on the result.

    Args:
        box_a: (tensor) boxes, shape [A, 4] or [n, A, 4].
        box_b: (tensor) boxes, shape [B, 4] or [n, B, 4].
    Return:
        (tensor) pairwise penalties, shape [A, B] (or [n, A, B] for batched input).
    """
    batched = box_a.dim() != 2
    if not batched:
        box_a = box_a[None, ...]
        box_b = box_b[None, ...]
    # Only the pairwise shape of `inter` is used below (via expand_as).
    inter = intersect(box_a, box_b)
    x1 = ((box_a[:, :, 2] + box_a[:, :, 0]) / 2).unsqueeze(2).expand_as(inter)
    y1 = ((box_a[:, :, 3] + box_a[:, :, 1]) / 2).unsqueeze(2).expand_as(inter)
    x2 = ((box_b[:, :, 2] + box_b[:, :, 0]) / 2).unsqueeze(1).expand_as(inter)
    y2 = ((box_b[:, :, 3] + box_b[:, :, 1]) / 2).unsqueeze(1).expand_as(inter)
    # Smallest enclosing box of each pair.
    cl = torch.min(box_a[:, :, 0].unsqueeze(2).expand_as(inter),
                   box_b[:, :, 0].unsqueeze(1).expand_as(inter))
    ct = torch.min(box_a[:, :, 1].unsqueeze(2).expand_as(inter),
                   box_b[:, :, 1].unsqueeze(1).expand_as(inter))
    cr = torch.max(box_a[:, :, 2].unsqueeze(2).expand_as(inter),
                   box_b[:, :, 2].unsqueeze(1).expand_as(inter))
    cb = torch.max(box_a[:, :, 3].unsqueeze(2).expand_as(inter),
                   box_b[:, :, 3].unsqueeze(1).expand_as(inter))
    out = (((x2 - x1) ** 2 + (y2 - y1) ** 2) /
           ((cr - cl) ** 2 + (cb - ct) ** 2 + 1e-7)) ** 0.6
    return out if batched else out.squeeze(0)
def elemwise_box_iou(box_a, box_b):
    """Elementwise IoU between corresponding boxes: [n, 4] x [n, 4] -> [n]."""
    overlap_min = torch.max(box_a[:, :2], box_b[:, :2])
    overlap_max = torch.min(box_a[:, 2:], box_b[:, 2:])
    wh = torch.clamp(overlap_max - overlap_min, min=0)
    inter = wh[:, 0] * wh[:, 1]
    area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])
    area_b = (box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])
    # Floor the union to avoid division blow-ups on degenerate boxes.
    union = torch.clamp(area_a + area_b - inter, min=0.1)
    return torch.clamp(inter / union, max=1)
def mask_iou(masks_a, masks_b, iscrowd=False):
    """Pairwise mask IoU between sets [a, h, w] and [b, h, w] -> [a, b].

    With iscrowd=True the intersection is normalized by area(masks_a) only.
    (Yes, this lives in box_utils: masks are flattened to vectors so the
    pairwise intersection is one matrix multiply.)
    """
    flat_a = masks_a.view(masks_a.size(0), -1)
    flat_b = masks_b.view(masks_b.size(0), -1)
    intersection = flat_a @ flat_b.t()
    area_a = flat_a.sum(dim=1).unsqueeze(1)
    if iscrowd:
        return intersection / area_a
    area_b = flat_b.sum(dim=1).unsqueeze(0)
    return intersection / (area_a + area_b - intersection)
def elemwise_mask_iou(masks_a, masks_b):
    """Elementwise mask IoU along the last dim: [h, w, n] x [h, w, n] -> [n]."""
    flat_a = masks_a.view(-1, masks_a.size(-1))
    flat_b = masks_b.view(-1, masks_b.size(-1))
    intersection = (flat_a * flat_b).sum(dim=0)
    union = flat_a.sum(dim=0) + flat_b.sum(dim=0) - intersection
    # Floor the union to avoid division blow-ups on empty masks.
    return torch.clamp(intersection / torch.clamp(union, min=0.1), max=1)
def change(gt, priors):
    """
    Compute the d_change metric proposed in Box2Pix:
    https://lmb.informatik.uni-freiburg.de/Publications/2018/UB18/paper-box2pix.pdf

    Inputs are point-form boxes (xmin, ymin, xmax, ymax); output has shape
    [num_gt, num_priors]. The result is negated so higher is better, making
    it a drop-in replacement for an overlap metric.
    """
    num_gt, num_priors = gt.size(0), priors.size(0)
    diff = (gt[:, None, :].expand(num_gt, num_priors, 4) -
            priors[None, :, :].expand(num_gt, num_priors, 4))
    # Normalize x-coordinates by gt width and y-coordinates by gt height.
    gt_w = (gt[:, 2] - gt[:, 0])[:, None].expand(num_gt, num_priors)
    gt_h = (gt[:, 3] - gt[:, 1])[:, None].expand(num_gt, num_priors)
    diff[:, :, 0] /= gt_w
    diff[:, :, 2] /= gt_w
    diff[:, :, 1] /= gt_h
    diff[:, :, 3] /= gt_h
    return -torch.sqrt((diff ** 2).sum(dim=2))
def match(pos_thresh, neg_thresh, truths, priors, labels, crowd_boxes, loc_t, conf_t, idx_t, idx, loc_data):
    """Match each prior box with the ground truth box of the highest jaccard
    overlap, encode the bounding boxes, then return the matched indices
    corresponding to both confidence and location preds.
    Args:
        pos_thresh: (float) IoU > pos_thresh ==> positive.
        neg_thresh: (float) IoU < neg_thresh ==> negative.
        truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors].
        priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
        labels: (tensor) All the class labels for the image, Shape: [num_obj].
        crowd_boxes: (tensor) All the crowd box annotations or None if there are none.
        loc_t: (tensor) Tensor to be filled w/ endcoded location targets.
        conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. Note: -1 means neutral.
        idx_t: (tensor) Tensor to be filled w/ the index of the matched gt box for each prior.
        idx: (int) current batch index.
        loc_data: (tensor) The predicted bbox regression coordinates for this batch.
    Return:
        The matched indices corresponding to 1)location and 2)confidence preds.
    Note:
        Results are written in-place into row `idx` of loc_t, conf_t and
        idx_t; nothing is actually returned.
    """
    # Either match against decoded predictions or against the raw priors,
    # depending on the config.
    decoded_priors = decode(loc_data, priors, cfg.use_yolo_regressors) if cfg.use_prediction_matching else point_form(priors)
    # Size [num_objects, num_priors]
    overlaps = jaccard(truths, decoded_priors) if not cfg.use_change_matching else change(truths, decoded_priors)
    # Size [num_priors] best ground truth for each prior
    best_truth_overlap, best_truth_idx = overlaps.max(0)
    # We want to ensure that each gt gets used at least once so that we don't
    # waste any training data. In order to do that, find the max overlap anchor
    # with each gt, and force that anchor to use that gt.
    for _ in range(overlaps.size(0)):
        # Find j, the gt with the highest overlap with a prior
        # In effect, this will loop through overlaps.size(0) in a "smart" order,
        # always choosing the highest overlap first.
        best_prior_overlap, best_prior_idx = overlaps.max(1)
        j = best_prior_overlap.max(0)[1]
        # Find i, the highest overlap anchor with this gt
        i = best_prior_idx[j]
        # Set all other overlaps with i to be -1 so that no other gt uses it
        overlaps[:, i] = -1
        # Set all other overlaps with j to be -1 so that this loop never uses j again
        overlaps[j, :] = -1
        # Overwrite i's score to be 2 so it doesn't get thresholded ever
        best_truth_overlap[i] = 2
        # Set the gt to be used for i to be j, overwriting whatever was there
        best_truth_idx[i] = j
    matches = truths[best_truth_idx]            # Shape: [num_priors,4]
    # Class 0 is background, so gt labels are shifted up by one.
    conf = labels[best_truth_idx] + 1           # Shape: [num_priors]
    conf[best_truth_overlap < pos_thresh] = -1  # label as neutral
    conf[best_truth_overlap < neg_thresh] = 0   # label as background
    # Deal with crowd annotations for COCO
    if crowd_boxes is not None and cfg.crowd_iou_threshold < 1:
        # Size [num_priors, num_crowds]
        crowd_overlaps = jaccard(decoded_priors, crowd_boxes, iscrowd=True)
        # Size [num_priors]
        best_crowd_overlap, best_crowd_idx = crowd_overlaps.max(1)
        # Set non-positives with crowd iou of over the threshold to be neutral.
        conf[(conf <= 0) & (best_crowd_overlap > cfg.crowd_iou_threshold)] = -1
    loc = encode(matches, priors, cfg.use_yolo_regressors)
    loc_t[idx] = loc            # [num_priors,4] encoded offsets to learn
    conf_t[idx] = conf          # [num_priors] top class label for each prior
    idx_t[idx] = best_truth_idx # [num_priors] indices for lookup
@torch.jit.script
def encode(matched, priors, use_yolo_regressors:bool=False):
    """
    Encode bboxes matched with each prior into the regression targets the
    network is trained to produce. Inverse of decode:
    encode(decode(x, p), p) = x.

    Args:
        - matched: A tensor of bboxes in point form with shape [num_priors, 4]
        - priors: The tensor of all priors with shape [num_priors, 4]
    Return: A tensor with encoded relative coordinates in the format
            outputted by the network (see decode). Size: [num_priors, 4]
    """
    if use_yolo_regressors:
        # Invert the yolo decode: offset from prior center plus log size ratio.
        boxes = center_size(matched)
        loc = torch.cat((boxes[:, :2] - priors[:, :2],
                         torch.log(boxes[:, 2:] / priors[:, 2:])), 1)
    else:
        # SSD-style encoding with the usual (0.1, 0.2) variances.
        variances = [0.1, 0.2]
        # Offset between match center and prior center, variance-scaled.
        g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2]
        g_cxcy = g_cxcy / (variances[0] * priors[:, 2:])
        # Log ratio of match wh to prior wh, variance-scaled.
        g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
        g_wh = torch.log(g_wh) / variances[1]
        # Target for smooth_l1_loss, shape [num_priors, 4].
        loc = torch.cat([g_cxcy, g_wh], 1)
    return loc
@torch.jit.script
def decode(loc, priors, use_yolo_regressors:bool=False):
    """
    Decode predicted bbox regression values back into point-form boxes.

    With use_yolo_regressors, uses the Yolov2 scheme
    (https://arxiv.org/pdf/1612.08242.pdf):
        b_xy = loc_xy + prior_xy
        b_wh = prior_wh * exp(loc_wh)
    where loc_xy is assumed to already be (sigmoid(pred) - .5) / conv_size
    and priors are [x, y, w, h] center-form coordinates, all relative to
    the image size. Otherwise the SSD scheme with variances (0.1, 0.2)
    is applied.

    Args:
        - loc: The predicted bounding boxes of size [num_priors, 4]
        - priors: The priorbox coords with size [num_priors, 4]
    Returns: A tensor of decoded relative coordinates in point form
             with size [num_priors, 4]
    """
    if use_yolo_regressors:
        boxes = point_form(torch.cat((
            loc[:, :2] + priors[:, :2],
            priors[:, 2:] * torch.exp(loc[:, 2:])
        ), 1))
    else:
        variances = [0.1, 0.2]
        boxes = torch.cat((
            priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
            priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
        # Convert (cx, cy, w, h) -> (xmin, ymin, xmax, ymax) in place.
        boxes[:, :2] -= boxes[:, 2:] / 2
        boxes[:, 2:] += boxes[:, :2]
    return boxes
def log_sum_exp(x):
    """Numerically stable log(sum(exp(x), dim=1)) over class scores.

    Used to compute the unaveraged confidence loss across all examples in a
    batch; subtracts the global max before exponentiating to avoid overflow.

    Args:
        x (Variable(tensor)): conf_preds from conf layers
    """
    x_max = x.data.max()
    return torch.log(torch.exp(x - x_max).sum(1)) + x_max
@torch.jit.script
def sanitize_coordinates(_x1, _x2, img_size:int, padding:int=0, cast:bool=True):
    """
    Sanitizes the input coordinates so that x1 < x2, x1 != x2, x1 >= 0, and x2 <= image_size.
    Also converts from relative to absolute coordinates and casts the results to long tensors.
    If cast is false, the result won't be cast to longs.
    Warning: this does things in-place behind the scenes so copy if necessary.
    """
    # Relative -> absolute coordinates.
    a = _x1 * img_size
    b = _x2 * img_size
    if cast:
        a = a.long()
        b = b.long()
    # Order the pair, then pad and clamp to the image bounds.
    lo = torch.min(a, b)
    hi = torch.max(a, b)
    lo = torch.clamp(lo - padding, min=0)
    hi = torch.clamp(hi + padding, max=img_size)
    return lo, hi
@torch.jit.script
def crop(masks, boxes, padding:int=1):
    """
    "Crop" predicted masks by zeroing out everything not in the predicted bbox.
    Vectorized by Chong (thanks Chong).

    Args:
        - masks should be a size [h, w, n] tensor of masks
        - boxes should be a size [n, 4] tensor of bbox coords in relative point form
    """
    h, w, n = masks.size()
    x1, x2 = sanitize_coordinates(boxes[:, 0], boxes[:, 2], w, padding, cast=False)
    y1, y2 = sanitize_coordinates(boxes[:, 1], boxes[:, 3], h, padding, cast=False)

    # Column / row index grids, broadcast across the n masks.
    col_idx = torch.arange(w, device=masks.device, dtype=x1.dtype).view(1, -1, 1).expand(h, w, n)
    row_idx = torch.arange(h, device=masks.device, dtype=x1.dtype).view(-1, 1, 1).expand(h, w, n)

    # A pixel survives iff it lies inside [x1, x2) x [y1, y2) for its mask.
    inside_x = (col_idx >= x1.view(1, 1, -1)) * (col_idx < x2.view(1, 1, -1))
    inside_y = (row_idx >= y1.view(1, 1, -1)) * (row_idx < y2.view(1, 1, -1))
    return masks * (inside_x * inside_y).float()
def index2d(src, idx):
    """
    Indexes a tensor by a 2d index: out[i, j] = src[i, idx[i, j]].
    Both src and idx should have the same size.
    """
    # Offset each row's indices into the flattened view of src.
    rows = torch.arange(idx.size(0), device=idx.device)[:, None].expand_as(idx)
    flat = (idx + rows * idx.size(1)).view(-1)
    return src.view(-1)[flat].view(idx.size())
================================================
FILE: layers/functions/__init__.py
================================================
from .detection import Detect
__all__ = ['Detect']
================================================
FILE: layers/functions/detection.py
================================================
import torch
import torch.nn.functional as F
from ..box_utils import decode, jaccard, distance, diou, index2d
from utils import timer
from data import cfg, mask_type
import numpy as np
class Detect(object):
"""At test time, Detect is the final layer of SSD. Decode location preds,
apply non-maximum suppression to location predictions based on conf
scores and threshold to a top_k number of output predictions for both
confidence score and locations, as the predicted masks.
"""
# TODO: Refactor this whole class away. It needs to go.
def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh):
    """Store detection/NMS hyper-parameters and default every NMS-variant
    flag to off; callers toggle exactly one variant before use."""
    if nms_thresh <= 0:
        raise ValueError('nms_threshold must be non negative.')
    self.num_classes = num_classes
    self.background_label = bkg_label
    self.top_k = top_k
    self.nms_thresh = nms_thresh
    self.conf_thresh = conf_thresh
    # NMS variant selectors (all off by default).
    self.use_cross_class_nms = False
    self.use_fast_nms = False
    self.use_cluster_nms = False
    self.use_cluster_diounms = False
    self.use_spm_nms = False
    self.use_spm_dist_nms = False
    self.use_spm_dist_weighted_nms = False
def __call__(self, predictions, net):
    """
    Args:
        predictions: dict with keys 'loc' [batch, num_priors, 4],
            'conf' [batch, num_priors, num_classes],
            'mask' [batch, num_priors, mask_dim], 'priors' [num_priors, 4]
            and optionally 'proto' / 'inst'.
        net: passed through untouched into each output entry.

    Returns:
        A list with one entry per batch element of
        {'detection': <result dict or None>, 'net': net}.
        Note that the outputs are sorted only if cross_class_nms is False.
    """
    loc_data = predictions['loc']
    conf_data = predictions['conf']
    mask_data = predictions['mask']
    prior_data = predictions['priors']
    proto_data = predictions['proto'] if 'proto' in predictions else None
    inst_data = predictions['inst'] if 'inst' in predictions else None

    out = []
    with timer.env('Detect'):
        batch_size = loc_data.size(0)
        num_priors = prior_data.size(0)
        # Rearrange to [batch, num_classes, num_priors] for per-class access.
        conf_preds = conf_data.view(batch_size, num_priors, self.num_classes).transpose(2, 1).contiguous()

        for batch_idx in range(batch_size):
            decoded_boxes = decode(loc_data[batch_idx], prior_data)
            result = self.detect(batch_idx, conf_preds, decoded_boxes, mask_data, inst_data)
            if result is not None and proto_data is not None:
                result['proto'] = proto_data[batch_idx]
            out.append({'detection': result, 'net': net})
    return out
def detect(self, batch_idx, conf_preds, decoded_boxes, mask_data, inst_data):
    """ Perform nms for only the max scoring class that isn't background (class 0) """
    # Per-class scores for this image; row 0 (background) is dropped.
    cur_scores = conf_preds[batch_idx, 1:, :]
    conf_scores, _ = torch.max(cur_scores, dim=0)
    # Confidence pre-filter applied before any NMS variant runs.
    keep = (conf_scores > self.conf_thresh)
    scores = cur_scores[:, keep]
    boxes = decoded_boxes[keep, :]
    masks = mask_data[batch_idx, keep, :]
    if inst_data is not None:
        # NOTE(review): `inst` is computed but never used below — confirm intent.
        inst = inst_data[batch_idx, keep, :]
    if scores.size(1) == 0:
        return None
    # Exactly one use_* flag is expected to be set before calling detect;
    # if none is set, the names below are unbound and the return raises
    # NameError. Multiple flags would run the variants sequentially.
    if self.use_cross_class_nms:
        if self.use_fast_nms:
            boxes, masks, classes, scores = self.cc_fast_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_cluster_nms:
            boxes, masks, classes, scores = self.cc_cluster_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_cluster_diounms:
            boxes, masks, classes, scores = self.cc_cluster_diounms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_spm_nms:
            boxes, masks, classes, scores = self.cc_cluster_SPM_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_spm_dist_nms:
            boxes, masks, classes, scores = self.cc_cluster_SPM_dist_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_spm_dist_weighted_nms:
            boxes, masks, classes, scores = self.cc_cluster_SPM_dist_weighted_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
    else:
        if self.use_fast_nms:
            boxes, masks, classes, scores = self.fast_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_cluster_nms:
            boxes, masks, classes, scores = self.cluster_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_cluster_diounms:
            boxes, masks, classes, scores = self.cluster_diounms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_spm_nms:
            boxes, masks, classes, scores = self.cluster_SPM_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_spm_dist_nms:
            boxes, masks, classes, scores = self.cluster_SPM_dist_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
        if self.use_spm_dist_weighted_nms:
            boxes, masks, classes, scores = self.cluster_SPM_dist_weighted_nms(boxes, masks, scores, self.nms_thresh, self.top_k)
    return {'box': boxes, 'mask': masks, 'class': classes, 'score': scores}
def cc_fast_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200):
    """Fast NMS with all classes collapsed into one (class-agnostic)."""
    # Reduce to a single best class (and score) per detection.
    scores, classes = scores.max(dim=0)
    _, idx = scores.sort(0, descending=True)
    idx = idx[:top_k]
    # Upper-triangular pairwise IoU: each det vs all higher-scoring dets.
    iou = jaccard(boxes[idx], boxes[idx]).triu_(diagonal=1)
    max_overlap, _ = torch.max(iou, dim=0)
    # Keep a det only if no higher-scoring det overlaps it above threshold.
    keep = idx[max_overlap <= iou_threshold]
    return boxes[keep], masks[keep], classes[keep], scores[keep]
def cc_cluster_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200):
    """Cluster-NMS with all classes collapsed into one (class-agnostic).

    Iterates the Fast-NMS suppression matrix to a fixed point so that boxes
    suppressed only by an already-suppressed box are reinstated."""
    scores, classes = scores.max(dim=0)
    _, idx = scores.sort(0, descending=True)
    idx = idx[:top_k]
    iou = jaccard(boxes[idx], boxes[idx]).triu_(diagonal=1)
    B = iou
    for _ in range(200):
        A = B
        maxA, _ = torch.max(A, dim=0)
        # Zero the rows of currently-suppressed detections and re-evaluate.
        survivors = (maxA <= iou_threshold).float().unsqueeze(1).expand_as(A)
        B = iou.mul(survivors)
        if A.equal(B):
            break
    keep = idx[maxA <= iou_threshold]
    return boxes[keep], masks[keep], classes[keep], scores[keep]
def cc_cluster_diounms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200):
    """Class-agnostic Cluster-NMS using the DIoU overlap (IoU minus a
    center-distance penalty) instead of plain IoU."""
    scores, classes = scores.max(dim=0)
    _, idx = scores.sort(0, descending=True)
    idx = idx[:top_k]
    iou = diou(boxes[idx], boxes[idx]).triu_(diagonal=1)
    B = iou
    for _ in range(200):
        A = B
        maxA, _ = torch.max(A, dim=0)
        # Zero the rows of currently-suppressed detections and re-evaluate.
        survivors = (maxA <= iou_threshold).float().unsqueeze(1).expand_as(A)
        B = iou.mul(survivors)
        if A.equal(B):
            break
    keep = idx[maxA <= iou_threshold]
    return boxes[keep], masks[keep], classes[keep], scores[keep]
def cc_cluster_SPM_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200):
    """Class-agnostic Cluster-NMS with score-penalty mechanism (SPM):
    instead of hard suppression, each detection's score is decayed by a
    Gaussian of its overlaps with surviving higher-scoring detections,
    and low scores are then thresholded away."""
    scores, classes = scores.max(dim=0)
    _, idx = scores.sort(0, descending=True)
    idx = idx[:top_k]
    boxes = boxes[idx]
    masks = masks[idx]
    classes = classes[idx]
    scores = scores[idx]
    iou = jaccard(boxes, boxes).triu_(diagonal=1)
    B = iou
    # Iterate the suppression matrix to a fixed point (Cluster-NMS).
    for _ in range(200):
        A = B
        maxA, _ = torch.max(A, dim=0)
        survivors = (maxA <= iou_threshold).float().unsqueeze(1).expand_as(A)
        B = iou.mul(survivors)
        if A.equal(B):
            break
    # Gaussian score decay over the converged overlaps.
    scores = torch.prod(torch.exp(-B**2/0.2), 0) * scores
    keep = scores > 0.01
    return boxes[keep], masks[keep], classes[keep], scores[keep]
def cc_cluster_SPM_dist_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200):
    """Class-agnostic Cluster-NMS with SPM score decay plus a normalized
    center-distance term added to the decay for overlapping pairs."""
    scores, classes = scores.max(dim=0)
    _, idx = scores.sort(0, descending=True)
    idx = idx[:top_k]
    boxes = boxes[idx]
    masks = masks[idx]
    classes = classes[idx]
    scores = scores[idx]
    iou = jaccard(boxes, boxes).triu_(diagonal=1)
    B = iou
    # Iterate the suppression matrix to a fixed point (Cluster-NMS).
    for _ in range(200):
        A = B
        maxA, _ = torch.max(A, dim=0)
        survivors = (maxA <= iou_threshold).float().unsqueeze(1).expand_as(A)
        B = iou.mul(survivors)
        if A.equal(B):
            break
    # Decay factor per pair: Gaussian of the overlap plus the distance
    # penalty (only where the overlap is positive), capped at 1.
    D = distance(boxes, boxes)
    X = (B >= 0).float()
    scores = torch.prod(torch.min(torch.exp(-B**2/0.2) + D*((B>0).float()), X), 0) * scores
    keep = scores > 0.01
    return boxes[keep], masks[keep], classes[keep], scores[keep]
def cc_cluster_SPM_dist_weighted_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200):
    """Class-agnostic Cluster-NMS with SPM score decay, distance penalty,
    and weighted box-coordinate merging: coordinates of strongly-overlapping
    (IoU > 0.8) detections are averaged, weighted by their decayed scores."""
    # Collapse all the classes into 1
    scores, classes = scores.max(dim=0)
    _, idx = scores.sort(0, descending=True)
    idx = idx[:top_k]
    boxes_idx = boxes[idx]
    scores = scores[idx]
    boxes = boxes_idx
    masks = masks[idx]
    classes = classes[idx]
    n = len(scores)
    iou = jaccard(boxes_idx, boxes_idx).triu_(diagonal=1)
    B = iou
    # Iterate the suppression matrix to a fixed point (Cluster-NMS).
    for i in range(200):
        A=B
        maxA,_=torch.max(A, dim=0)
        E = (maxA<=iou_threshold).float().unsqueeze(1).expand_as(A)
        B=iou.mul(E)
        if A.equal(B)==True:
            break
    # SPM: Gaussian score decay plus distance penalty, capped at 1.
    D=distance(boxes, boxes)
    X = (B>=0).float()
    scores = torch.prod(torch.min(torch.exp(-B**2/0.2)+D*((B>0).float()),X),0)*scores
    idx_out = scores > 0.01
    # Per-pair merge weights: strong overlaps (>0.8) plus each box itself
    # (identity term), scaled column-wise by the decayed scores.
    weights = (B*(B>0.8).float() + torch.eye(n).cuda()) * (scores.reshape((1,n)))
    # weights[i, j] pairs row-box i with column-box j; xx1[i, j] = boxes[j, 0].
    xx1 = boxes[:,0].expand(n,n)
    yy1 = boxes[:,1].expand(n,n)
    xx2 = boxes[:,2].expand(n,n)
    yy2 = boxes[:,3].expand(n,n)
    weightsum=weights.sum(dim=1)
    # Weighted average of each coordinate over the merge cluster.
    xx1 = (xx1*weights).sum(dim=1)/(weightsum)
    yy1 = (yy1*weights).sum(dim=1)/(weightsum)
    xx2 = (xx2*weights).sum(dim=1)/(weightsum)
    yy2 = (yy2*weights).sum(dim=1)/(weightsum)
    boxes = torch.stack([xx1, yy1, xx2, yy2], 1)
    return boxes[idx_out], masks[idx_out], classes[idx_out], scores[idx_out]
def fast_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, second_threshold:bool=False):
    """Fast NMS from YOLACT: one parallel matrix suppression pass per class.

    Args:
        boxes:  [num_dets, 4] box coordinates.
        masks:  [num_dets, mask_dim] mask coefficients.
        scores: [num_classes, num_dets] per-class confidences.
        iou_threshold: IoU above which a box is suppressed by a
            higher-scoring box of the same class.
        top_k: detections kept per class before suppression.
        second_threshold: additionally require scores > self.conf_thresh.

    Returns:
        (boxes, masks, classes, scores), at most cfg.max_num_detections
        entries, sorted by descending score.
    """
    # Sort per class and keep the top_k highest-scoring detections of each.
    scores, idx = scores.sort(1, descending=True)

    idx = idx[:, :top_k].contiguous()
    scores = scores[:, :top_k]

    num_classes, num_dets = idx.size()

    boxes = boxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    masks = masks[idx.view(-1), :].view(num_classes, num_dets, -1)

    # Per-class IoU matrix; triu keeps only each box vs. higher-scoring ones.
    iou = jaccard(boxes, boxes).triu_(diagonal=1)
    iou_max, _ = iou.max(dim=1)

    # Now just filter out the ones higher than the threshold
    keep = (iou_max <= iou_threshold)

    # We should also only keep detections over the confidence threshold, but at the cost of
    # maxing out your detection count for every image, you can just not do that. Because we
    # have such a minimal amount of computation per detection (matrix multiplication only),
    # this increase doesn't affect us much (+0.2 mAP for 34 -> 33 fps), so we leave it out.
    # However, when you implement this in your method, you should do this second threshold.
    if second_threshold:
        keep *= (scores > self.conf_thresh)
    keep *= (scores > 0.01)

    # Assign each kept detection to its corresponding class
    classes = torch.arange(num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    masks = masks[keep]
    scores = scores[keep]

    # Only keep the top cfg.max_num_detections highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    idx = idx[:cfg.max_num_detections]
    scores = scores[:cfg.max_num_detections]

    classes = classes[idx]
    boxes = boxes[idx]
    masks = masks[idx]

    return boxes, masks, classes, scores
def cluster_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, second_threshold:bool=False):
    """Cluster-NMS: iterate Fast NMS's matrix suppression to a fixed point,
    which matches the result of exact greedy NMS per class.

    Args and returns as in fast_nms (second_threshold is unused here).
    """
    scores, idx = scores.sort(1, descending=True)

    idx = idx[:, :top_k].contiguous()
    scores = scores[:, :top_k]

    num_classes, num_dets = idx.size()

    boxes = boxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    masks = masks[idx.view(-1), :].view(num_classes, num_dets, -1)

    iou = jaccard(boxes, boxes).triu_(diagonal=1)
    B = iou
    # Repeatedly discount suppression coming from boxes that are themselves
    # suppressed, until the matrix stops changing (200 = hard iteration cap).
    for i in range(200):
        A=B
        maxA,_ = A.max(dim=1)
        E = (maxA <= iou_threshold).float().unsqueeze(2).expand_as(A)
        B=iou.mul(E)
        if A.equal(B)==True:
            break
    # Keep boxes not suppressed at convergence, with a 0.01 score floor.
    keep = (maxA <= iou_threshold)
    keep *= (scores > 0.01)
    # Assign each kept detection to its corresponding class
    classes = torch.arange(num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    masks = masks[keep]
    scores = scores[keep]

    # Only keep the top cfg.max_num_detections highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    idx = idx[:cfg.max_num_detections]
    scores = scores[:cfg.max_num_detections]

    classes = classes[idx]
    boxes = boxes[idx]
    masks = masks[idx]

    return boxes, masks, classes, scores
def cluster_diounms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, second_threshold:bool=False):
    """Cluster-NMS using DIoU (IoU minus normalized center distance) as the
    suppression criterion instead of plain IoU.

    Args and returns as in fast_nms (second_threshold is unused here).
    """
    scores, idx = scores.sort(1, descending=True)

    idx = idx[:, :top_k].contiguous()
    scores = scores[:, :top_k]

    num_classes, num_dets = idx.size()

    boxes = boxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    masks = masks[idx.view(-1), :].view(num_classes, num_dets, -1)

    # Pairwise DIoU matrix from the external `diou` helper.
    iou = diou(boxes, boxes).triu_(diagonal=1)
    B = iou
    # Cluster-NMS fixed-point iteration (see cluster_nms).
    for i in range(200):
        A=B
        maxA,_ = A.max(dim=1)
        E = (maxA <= iou_threshold).float().unsqueeze(2).expand_as(A)
        B=iou.mul(E)
        if A.equal(B)==True:
            break
    keep = (maxA <= iou_threshold) * (scores > 0.01)
    # Assign each kept detection to its corresponding class
    classes = torch.arange(num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    masks = masks[keep]
    scores = scores[keep]

    # Only keep the top cfg.max_num_detections highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    idx = idx[:cfg.max_num_detections]
    scores = scores[:cfg.max_num_detections]

    classes = classes[idx]
    boxes = boxes[idx]
    masks = masks[idx]

    return boxes, masks, classes, scores
def cluster_SPM_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, second_threshold:bool=False):
    """Cluster-NMS with Score Penalty Mechanism: instead of hard suppression,
    every box's score is decayed by exp(-iou^2/0.2) for each higher-scoring
    box it overlaps, then thresholded at 0.01.

    Args and returns as in fast_nms (second_threshold is unused here).
    """
    scores, idx = scores.sort(1, descending=True)

    idx = idx[:, :top_k].contiguous()
    scores = scores[:, :top_k]

    num_classes, num_dets = idx.size()

    boxes = boxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    masks = masks[idx.view(-1), :].view(num_classes, num_dets, -1)

    iou = jaccard(boxes, boxes).triu_(diagonal=1)
    B = iou
    # Cluster-NMS fixed-point iteration (see cluster_nms).
    for i in range(200):
        A=B
        maxA,_ = A.max(dim=1)
        E = (maxA <= iou_threshold).float().unsqueeze(2).expand_as(A)
        B=iou.mul(E)
        if A.equal(B)==True:
            break
    # Soft suppression: product of Gaussian decay factors down each column.
    scores = torch.prod(torch.exp(-B**2/0.2),1)*scores
    keep = (scores > 0.01)
    #print('keep',torch.sum(keep))
    # Assign each kept detection to its corresponding class
    classes = torch.arange(num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    masks = masks[keep]
    scores = scores[keep]

    # Only keep the top cfg.max_num_detections highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    idx = idx[:cfg.max_num_detections]
    scores = scores[:cfg.max_num_detections]

    classes = classes[idx]
    boxes = boxes[idx]
    masks = masks[idx]

    return boxes, masks, classes, scores
def cluster_SPM_dist_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, second_threshold:bool=False):
    """Cluster-NMS with SPM score decay relaxed by a center-distance term:
    far-apart but overlapping boxes are penalized less than coincident ones.

    Args and returns as in fast_nms (second_threshold is unused here).
    """
    scores, idx = scores.sort(1, descending=True)

    idx = idx[:, :top_k].contiguous()
    scores = scores[:, :top_k]

    num_classes, num_dets = idx.size()

    boxes = boxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    masks = masks[idx.view(-1), :].view(num_classes, num_dets, -1)

    iou = jaccard(boxes, boxes).triu_(diagonal=1)
    B = iou
    # Cluster-NMS fixed-point iteration (see cluster_nms).
    for i in range(200):
        A=B
        maxA,_ = A.max(dim=1)
        E = (maxA <= iou_threshold).float().unsqueeze(2).expand_as(A)
        B=iou.mul(E)
        if A.equal(B)==True:
            break
    # Pairwise central-point distance from the external `distance` helper
    # (presumably normalized — TODO confirm its semantics).
    D=distance(boxes, boxes)
    X = (B>=0).float()  # all-ones cap so no penalty factor exceeds 1
    # SPM decay plus distance relaxation for overlapping pairs; product down
    # each column yields the per-box total penalty.
    scores = torch.prod(torch.min(torch.exp(-B**2/0.2)+D*((B>0).float()),X),1)*scores
    keep = (scores > 0.01)
    # Assign each kept detection to its corresponding class
    classes = torch.arange(num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    masks = masks[keep]
    scores = scores[keep]

    # Only keep the top cfg.max_num_detections highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    idx = idx[:cfg.max_num_detections]
    scores = scores[:cfg.max_num_detections]

    classes = classes[idx]
    boxes = boxes[idx]
    masks = masks[idx]

    return boxes, masks, classes, scores
def cluster_SPM_dist_weighted_nms(self, boxes, masks, scores, iou_threshold:float=0.5, top_k:int=200, second_threshold:bool=False):
    """Cluster-NMS with SPM + distance decay and weighted box fusion: each
    kept box's coordinates become a score-weighted average of the boxes that
    overlap it with IoU > 0.8.

    Args and returns as in fast_nms (second_threshold is unused here).
    """
    scores, idx = scores.sort(1, descending=True)

    idx = idx[:, :top_k].contiguous()
    scores = scores[:, :top_k]

    num_classes, num_dets = idx.size()

    boxes = boxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    masks = masks[idx.view(-1), :].view(num_classes, num_dets, -1)

    iou = jaccard(boxes, boxes).triu_(diagonal=1)
    B = iou
    # Cluster-NMS fixed-point iteration (see cluster_nms).
    for i in range(200):
        A=B
        maxA,_ = A.max(dim=1)
        E = (maxA <= iou_threshold).float().unsqueeze(2).expand_as(A)
        B=iou.mul(E)
        if A.equal(B)==True:
            break
    D=distance(boxes, boxes)
    X = (B>=0).float()  # all-ones cap so no penalty factor exceeds 1
    # SPM + distance score decay, then a 0.01 survival floor.
    scores = torch.prod(torch.min(torch.exp(-B**2/0.2)+D*((B>0).float()),X),1)*scores
    keep = (scores > 0.01)
    # Zero the IoU rows of dropped boxes so they don't contribute weights.
    E = keep.float().unsqueeze(2).expand_as(A)
    B=iou.mul(E)
    _,n = scores.size()
    # weights[c, i, j]: decayed score of box j when it overlaps box i with
    # IoU > 0.8 (or j == i via the identity), per class c.
    # NOTE(review): 80 classes and .cuda() are hard-coded here — this breaks
    # for other class counts or CPU-only runs; verify before reuse.
    weights = (B*(B>0.8).float() + torch.eye(n).cuda().expand(80,n,n)) * (scores.unsqueeze(2).expand(80,n,n))
    xx1 = boxes[:,:,0].unsqueeze(1).expand(80,n,n)
    yy1 = boxes[:,:,1].unsqueeze(1).expand(80,n,n)
    xx2 = boxes[:,:,2].unsqueeze(1).expand(80,n,n)
    yy2 = boxes[:,:,3].unsqueeze(1).expand(80,n,n)

    weightsum=weights.sum(dim=2)
    # Confidence-weighted coordinate fusion across each box's cluster.
    xx1 = (xx1*weights).sum(dim=2)/(weightsum)
    yy1 = (yy1*weights).sum(dim=2)/(weightsum)
    xx2 = (xx2*weights).sum(dim=2)/(weightsum)
    yy2 = (yy2*weights).sum(dim=2)/(weightsum)
    boxes = torch.stack([xx1, yy1, xx2, yy2], 2)
    # Assign each kept detection to its corresponding class
    classes = torch.arange(num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    masks = masks[keep]
    scores = scores[keep]

    # Only keep the top cfg.max_num_detections highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    idx = idx[:cfg.max_num_detections]
    scores = scores[:cfg.max_num_detections]

    classes = classes[idx]
    boxes = boxes[idx]
    masks = masks[idx]

    return boxes, masks, classes, scores
def traditional_nms_yolact(self, boxes, masks, scores, iou_threshold=0.5, conf_thresh=0.05):
    """Classic per-class greedy NMS using the compiled Cython implementation.

    Args:
        boxes:  [num_dets, 4] boxes in relative coordinates.
        masks:  [num_dets, mask_dim] mask coefficients.
        scores: [num_classes, num_dets] per-class confidences.
        iou_threshold: overlap above which a lower-scoring box is removed.
        conf_thresh: minimum class confidence for a box to enter NMS.

    Returns:
        (boxes, masks, classes, scores) of the kept detections, at most
        cfg.max_num_detections of them, sorted by descending score.
    """
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()}, reload_support=True)

    from utils.cython_nms import nms as cnms

    num_classes = scores.size(0)

    idx_lst = []
    cls_lst = []
    scr_lst = []

    # Multiplying by max_size is necessary because of how cnms computes its area and intersections
    boxes = boxes * cfg.max_size

    for _cls in range(num_classes):
        cls_scores = scores[_cls, :]
        conf_mask = cls_scores > conf_thresh
        idx = torch.arange(cls_scores.size(0), device=boxes.device)

        cls_scores = cls_scores[conf_mask]
        idx = idx[conf_mask]

        if cls_scores.size(0) == 0:
            continue

        preds = torch.cat([boxes[conf_mask], cls_scores[:, None]], dim=1).cpu().numpy()
        keep = cnms(preds, iou_threshold)
        # Use torch.tensor, not the legacy torch.Tensor constructor: the
        # legacy constructor raises when given data together with a CUDA
        # device, so the old code crashed whenever boxes lived on the GPU.
        keep = torch.tensor(keep, dtype=torch.long, device=boxes.device)

        idx_lst.append(idx[keep])
        cls_lst.append(keep * 0 + _cls)
        scr_lst.append(cls_scores[keep])

    idx = torch.cat(idx_lst, dim=0)
    classes = torch.cat(cls_lst, dim=0)
    scores = torch.cat(scr_lst, dim=0)

    scores, idx2 = scores.sort(0, descending=True)
    idx2 = idx2[:cfg.max_num_detections]
    scores = scores[:cfg.max_num_detections]

    idx = idx[idx2]
    classes = classes[idx2]

    # Undo the multiplication above
    return boxes[idx] / cfg.max_size, masks[idx], classes, scores
def traditional_nms_ours(self, boxes, masks, scores, iou_threshold=0.5, conf_thresh=0.05):
    """Per-class greedy Cython NMS, our variant: keeps the top-200 boxes per
    class and applies a 0.01 score floor after suppression (mirroring the
    cluster-NMS methods above).

    Args:
        boxes:  [num_dets, 4] boxes in relative coordinates.
        masks:  [num_dets, mask_dim] mask coefficients.
        scores: [num_classes, num_dets] per-class confidences.
        iou_threshold: overlap above which a lower-scoring box is removed.
        conf_thresh: unused here (kept for signature parity with
            traditional_nms_yolact).

    Returns:
        (boxes, masks, classes, scores) of the kept detections, at most
        cfg.max_num_detections of them, sorted by descending score.
    """
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()}, reload_support=True)

    from utils.cython_nms import nms as cnms

    num_classes = scores.size(0)

    idx_lst = []
    cls_lst = []
    scr_lst = []
    box_lst = []
    mask_lst = []

    # Multiplying by max_size is necessary because of how cnms computes its area and intersections
    boxes = boxes * cfg.max_size

    for _cls in range(num_classes):
        cls_scores = scores[_cls, :]
        # `order` (renamed from `id`, which shadowed the builtin): the
        # indices of this class's top-200 scoring detections.
        _, order = cls_scores.sort(0, descending=True)
        order = order[:200].contiguous()
        cls_scores = cls_scores[order]
        idx = torch.arange(cls_scores.size(0), device=boxes.device)

        if cls_scores.size(0) == 0:
            continue

        preds = torch.cat([boxes[order], cls_scores[:, None]], dim=1).cpu().numpy()
        keep = cnms(preds, iou_threshold)
        # Use torch.tensor, not the legacy torch.Tensor constructor: the
        # legacy constructor raises when given data together with a CUDA
        # device, so the old code crashed whenever boxes lived on the GPU.
        keep = torch.tensor(keep, dtype=torch.long, device=boxes.device)
        # Post-NMS score floor, consistent with the cluster-NMS variants.
        m = (cls_scores[keep] > 0.01)

        idx_lst.append(idx[keep][m])
        cls_lst.append(keep[m] * 0 + _cls)
        scr_lst.append(cls_scores[keep][m])
        box_lst.append(boxes[order][keep][m])
        mask_lst.append(masks[order][keep][m])

    idx = torch.cat(idx_lst, dim=0)
    classes = torch.cat(cls_lst, dim=0)
    scores = torch.cat(scr_lst, dim=0)
    boxes = torch.cat(box_lst, dim=0)
    masks = torch.cat(mask_lst, dim=0)

    scores, idx2 = scores.sort(0, descending=True)
    idx2 = idx2[:cfg.max_num_detections]
    scores = scores[:cfg.max_num_detections]
    classes = classes[idx2]

    # Undo the multiplication above
    return boxes[idx2] / cfg.max_size, masks[idx2], classes, scores
================================================
FILE: layers/interpolate.py
================================================
import torch.nn as nn
import torch.nn.functional as F
class InterpolateModule(nn.Module):
    """
    A module-form wrapper for F.interpolate (rip nn.Upsampling).

    All positional and keyword arguments supplied at construction time are
    replayed verbatim on every forward call.
    """

    def __init__(self, *args, **kwdargs):
        super().__init__()
        # Stash the interpolate arguments for forward() to replay.
        self.args = args
        self.kwdargs = kwdargs

    def forward(self, x):
        # Hand the stored construction arguments straight to F.interpolate.
        args, kwargs = self.args, self.kwdargs
        return F.interpolate(x, *args, **kwargs)
================================================
FILE: layers/modules/__init__.py
================================================
from .multibox_loss import MultiBoxLoss
__all__ = ['MultiBoxLoss']
================================================
FILE: layers/modules/multibox_loss.py
================================================
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.autograd import Variable
from ..box_utils import match, log_sum_exp, decode, center_size, crop, elemwise_mask_iou, elemwise_box_iou
from data import cfg, mask_type, activation_func
def ciou(bboxes1, bboxes2):
    """Sum of (1 - CIoU) over element-wise pairs of predicted/target boxes.

    Inputs are raw network outputs in (cx, cy, w, h) form: centers are
    squashed with sigmoid and sizes with exp before computing Complete-IoU,
    which augments IoU with a normalized center-distance term and an
    aspect-ratio consistency term.

    Args:
        bboxes1, bboxes2: [n, 4] tensors, paired element-wise (the shorter
            tensor is swapped to the front if the counts differ).

    Returns:
        Scalar loss sum; an all-zero (rows, cols) tensor if either input is
        empty (kept for backward compatibility).
    """
    bboxes1 = torch.sigmoid(bboxes1)
    bboxes2 = torch.sigmoid(bboxes2)
    rows = bboxes1.shape[0]
    cols = bboxes2.shape[0]
    cious = torch.zeros((rows, cols))
    if rows * cols == 0:
        return cious
    exchange = False
    if bboxes1.shape[0] > bboxes2.shape[0]:
        bboxes1, bboxes2 = bboxes2, bboxes1
        cious = torch.zeros((cols, rows))
        exchange = True
    w1 = torch.exp(bboxes1[:, 2])
    h1 = torch.exp(bboxes1[:, 3])
    w2 = torch.exp(bboxes2[:, 2])
    h2 = torch.exp(bboxes2[:, 3])
    area1 = w1 * h1
    area2 = w2 * h2
    center_x1 = bboxes1[:, 0]
    center_y1 = bboxes1[:, 1]
    center_x2 = bboxes2[:, 0]
    center_y2 = bboxes2[:, 1]

    # Intersection rectangle.
    inter_l = torch.max(center_x1 - w1 / 2, center_x2 - w2 / 2)
    inter_r = torch.min(center_x1 + w1 / 2, center_x2 + w2 / 2)
    inter_t = torch.max(center_y1 - h1 / 2, center_y2 - h2 / 2)
    inter_b = torch.min(center_y1 + h1 / 2, center_y2 + h2 / 2)
    inter_area = torch.clamp((inter_r - inter_l), min=0) * torch.clamp((inter_b - inter_t), min=0)

    # Smallest enclosing box, for the normalized center-distance penalty.
    c_l = torch.min(center_x1 - w1 / 2, center_x2 - w2 / 2)
    c_r = torch.max(center_x1 + w1 / 2, center_x2 + w2 / 2)
    c_t = torch.min(center_y1 - h1 / 2, center_y2 - h2 / 2)
    c_b = torch.max(center_y1 + h1 / 2, center_y2 + h2 / 2)

    inter_diag = (center_x2 - center_x1)**2 + (center_y2 - center_y1)**2
    c_diag = torch.clamp((c_r - c_l), min=0)**2 + torch.clamp((c_b - c_t), min=0)**2

    union = area1 + area2 - inter_area
    u = (inter_diag) / c_diag
    iou = inter_area / union
    # Aspect-ratio consistency term v.
    v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(w2 / h2) - torch.atan(w1 / h1)), 2)
    with torch.no_grad():
        S = (iou > 0.5).float()
        # Clamp the denominator: for a perfect match (iou == 1, v == 0) the
        # original expression evaluated 0/0 = NaN, which poisoned the whole
        # summed loss. alpha is 0 there anyway since v == 0.
        alpha = S * v / torch.clamp(1 - iou + v, min=1e-9)
    cious = iou - u - alpha * v
    cious = torch.clamp(cious, min=-1.0, max=1.0)
    if exchange:
        cious = cious.T
    return torch.sum(1 - cious)
def diou(bboxes1, bboxes2):
    """Sum of (1 - DIoU) over element-wise pairs of predicted/target boxes.

    Inputs are raw network outputs in (cx, cy, w, h) form: centers are
    squashed with sigmoid and sizes with exp, then DIoU = IoU minus the
    squared center distance normalized by the enclosing-box diagonal.
    Returns an all-zero (rows, cols) tensor if either input is empty,
    otherwise a scalar loss sum.
    """
    bboxes1 = torch.sigmoid(bboxes1)
    bboxes2 = torch.sigmoid(bboxes2)
    rows = bboxes1.shape[0]
    cols = bboxes2.shape[0]
    if rows * cols == 0:
        return torch.zeros((rows, cols))
    # Keep the shorter tensor first; transpose the result back at the end.
    swapped = rows > cols
    if swapped:
        bboxes1, bboxes2 = bboxes2, bboxes1
    w1, h1 = torch.exp(bboxes1[:, 2]), torch.exp(bboxes1[:, 3])
    w2, h2 = torch.exp(bboxes2[:, 2]), torch.exp(bboxes2[:, 3])
    cx1, cy1 = bboxes1[:, 0], bboxes1[:, 1]
    cx2, cy2 = bboxes2[:, 0], bboxes2[:, 1]
    # Corner coordinates of both box sets.
    l1, r1 = cx1 - w1 / 2, cx1 + w1 / 2
    t1, b1 = cy1 - h1 / 2, cy1 + h1 / 2
    l2, r2 = cx2 - w2 / 2, cx2 + w2 / 2
    t2, b2 = cy2 - h2 / 2, cy2 + h2 / 2
    # Intersection / union.
    inter_w = torch.clamp(torch.min(r1, r2) - torch.max(l1, l2), min=0)
    inter_h = torch.clamp(torch.min(b1, b2) - torch.max(t1, t2), min=0)
    inter_area = inter_w * inter_h
    union = (w1 * h1 + w2 * h2) - inter_area
    iou = inter_area / union
    # Squared center distance over the squared enclosing-box diagonal.
    center_dist = (cx2 - cx1) ** 2 + (cy2 - cy1) ** 2
    enclose_diag = (torch.clamp(torch.max(r1, r2) - torch.min(l1, l2), min=0) ** 2
                    + torch.clamp(torch.max(b1, b2) - torch.min(t1, t2), min=0) ** 2)
    dious = torch.clamp(iou - center_dist / enclose_diag, min=-1.0, max=1.0)
    if swapped:
        dious = dious.T
    return torch.sum(1 - dious)
class MultiBoxLoss(nn.Module):
"""SSD Weighted Loss Function
Compute Targets:
1) Produce Confidence Target Indices by matching ground truth boxes
with (default) 'priorboxes' that have jaccard index > threshold parameter
(default threshold: 0.5).
2) Produce localization target by 'encoding' variance into offsets of ground
truth boxes and their matched 'priorboxes'.
3) Hard negative mining to filter the excessive number of negative examples
that comes with using a large number of default bounding boxes.
(default negative:positive ratio 3:1)
Objective Loss:
L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss
weighted by α which is set to 1 by cross val.
Args:
c: class confidences,
l: predicted boxes,
g: ground truth boxes
N: number of matched default boxes
See: https://arxiv.org/pdf/1512.02325.pdf for more details.
"""
def __init__(self, num_classes, pos_threshold, neg_threshold, negpos_ratio):
    """Store matching thresholds and the hard-negative-mining ratio.

    Args:
        num_classes: number of classes, background included.
        pos_threshold: IoU above which a prior matches a ground truth box.
        neg_threshold: IoU below which a prior is a definite negative.
        negpos_ratio: negatives kept per positive during OHEM.
    """
    super(MultiBoxLoss, self).__init__()
    self.num_classes = num_classes
    self.pos_threshold = pos_threshold
    self.neg_threshold = neg_threshold
    self.negpos_ratio = negpos_ratio

    # If you output a proto mask with this area, your l1 loss will be l1_alpha
    # Note that the area is relative (so 1 would be the entire image)
    self.l1_expected_area = 20 * 20 / 70 / 70
    self.l1_alpha = 0.1

    # Running per-class counts for the optional class-balanced conf loss.
    if cfg.use_class_balanced_conf:
        self.class_instances = None
        self.total_instances = 0
def forward(self, net, predictions, targets, masks, num_crowds):
    """Multibox Loss

    Args:
        net: the model (only used by the mask IoU sub-loss).
        predictions (dict): A dict containing loc preds, conf preds,
        mask preds, and prior boxes from SSD net.
            loc shape: torch.size(batch_size,num_priors,4)
            conf shape: torch.size(batch_size,num_priors,num_classes)
            masks shape: torch.size(batch_size,num_priors,mask_dim)
            priors shape: torch.size(num_priors,4)
            proto* shape: torch.size(batch_size,mask_h,mask_w,mask_dim)

        targets (list): Ground truth boxes and labels for a batch,
            shape: [batch_size][num_objs,5] (last idx is the label).

        masks (list): Ground truth masks for each object in each image,
            shape: [batch_size][num_objs,im_height,im_width]

        num_crowds (list): Number of crowd annotations per batch. The crowd
            annotations should be the last num_crowds elements of targets and masks.

        * Only if mask_type == lincomb

    Returns:
        dict of named scalar losses (see the key legend before the return).
    """
    loc_data = predictions['loc']
    conf_data = predictions['conf']
    mask_data = predictions['mask']
    priors = predictions['priors']

    if cfg.mask_type == mask_type.lincomb:
        proto_data = predictions['proto']

    score_data = predictions['score'] if cfg.use_mask_scoring else None
    inst_data = predictions['inst'] if cfg.use_instance_coeff else None

    labels = [None] * len(targets) # Used in sem segm loss

    batch_size = loc_data.size(0)
    num_priors = priors.size(0)
    num_classes = self.num_classes

    # Match priors (default boxes) and ground truth boxes
    # These tensors will be created with the same device as loc_data
    loc_t = loc_data.new(batch_size, num_priors, 4)
    gt_box_t = loc_data.new(batch_size, num_priors, 4)
    conf_t = loc_data.new(batch_size, num_priors).long()
    idx_t = loc_data.new(batch_size, num_priors).long()

    if cfg.use_class_existence_loss:
        class_existence_t = loc_data.new(batch_size, num_classes - 1)

    for idx in range(batch_size):
        truths = targets[idx][:, :-1].data
        labels[idx] = targets[idx][:, -1].data.long()

        if cfg.use_class_existence_loss:
            # Construct a one-hot vector for each object and collapse it into an existence vector with max
            # Also it's fine to include the crowd annotations here
            class_existence_t[idx, :] = torch.eye(num_classes - 1, device=conf_t.get_device())[labels[idx]].max(dim=0)[0]

        # Split the crowd annotations because they come bundled in
        cur_crowds = num_crowds[idx]
        if cur_crowds > 0:
            split = lambda x: (x[-cur_crowds:], x[:-cur_crowds])
            crowd_boxes, truths = split(truths)

            # We don't use the crowd labels or masks
            _, labels[idx] = split(labels[idx])
            _, masks[idx] = split(masks[idx])
        else:
            crowd_boxes = None

        match(self.pos_threshold, self.neg_threshold,
              truths, priors.data, labels[idx], crowd_boxes,
              loc_t, conf_t, idx_t, idx, loc_data[idx])

        gt_box_t[idx, :, :] = truths[idx_t[idx]]

    # wrap targets
    loc_t = Variable(loc_t, requires_grad=False)
    conf_t = Variable(conf_t, requires_grad=False)
    idx_t = Variable(idx_t, requires_grad=False)

    pos = conf_t > 0
    num_pos = pos.sum(dim=1, keepdim=True)

    # Shape: [batch,num_priors,4]
    pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)

    losses = {}
    # Defensive default: maskiou_targets is only assigned on the lincomb mask
    # path below, but it is read whenever cfg.use_maskiou is on; without this
    # the combination use_maskiou && !train_masks raised NameError.
    maskiou_targets = None

    # Localization Loss (Smooth L1 or CIoU)
    if cfg.train_boxes:
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)

        if cfg.reg_loss == 'ciou':
            losses['B'] = ciou(loc_p, loc_t) * cfg.bbox_alpha * 5
        else:
            if cfg.reg_loss == 'sl1':
                losses['B'] = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') * cfg.bbox_alpha
            else:
                raise AssertionError("Currently, bbox regression supports 'ciou' or 'sl1'.")

    if cfg.train_masks:
        if cfg.mask_type == mask_type.direct:
            if cfg.use_gt_bboxes:
                pos_masks = []
                for idx in range(batch_size):
                    pos_masks.append(masks[idx][idx_t[idx, pos[idx]]])
                masks_t = torch.cat(pos_masks, 0)
                masks_p = mask_data[pos, :].view(-1, cfg.mask_dim)
                losses['M'] = F.binary_cross_entropy(torch.clamp(masks_p, 0, 1), masks_t, reduction='sum') * cfg.mask_alpha
            else:
                losses['M'] = self.direct_mask_loss(pos_idx, idx_t, loc_data, mask_data, priors, masks)
        elif cfg.mask_type == mask_type.lincomb:
            ret = self.lincomb_mask_loss(pos, idx_t, loc_data, mask_data, priors, proto_data, masks, gt_box_t, score_data, inst_data, labels)
            if cfg.use_maskiou:
                loss, maskiou_targets = ret
            else:
                loss = ret
            losses.update(loss)

            if cfg.mask_proto_loss is not None:
                if cfg.mask_proto_loss == 'l1':
                    losses['P'] = torch.mean(torch.abs(proto_data)) / self.l1_expected_area * self.l1_alpha
                elif cfg.mask_proto_loss == 'disj':
                    losses['P'] = -torch.mean(torch.max(F.log_softmax(proto_data, dim=-1), dim=-1)[0])

    # Confidence loss
    if cfg.use_focal_loss:
        if cfg.use_sigmoid_focal_loss:
            losses['C'] = self.focal_conf_sigmoid_loss(conf_data, conf_t)
        elif cfg.use_objectness_score:
            losses['C'] = self.focal_conf_objectness_loss(conf_data, conf_t)
        else:
            losses['C'] = self.focal_conf_loss(conf_data, conf_t)
    else:
        if cfg.use_objectness_score:
            losses['C'] = self.conf_objectness_loss(conf_data, conf_t, batch_size, loc_p, loc_t, priors)
        else:
            losses['C'] = self.ohem_conf_loss(conf_data, conf_t, pos, batch_size)

    # Mask IoU Loss
    if cfg.use_maskiou and maskiou_targets is not None:
        losses['I'] = self.mask_iou_loss(net, maskiou_targets)

    # These losses also don't depend on anchors
    if cfg.use_class_existence_loss:
        losses['E'] = self.class_existence_loss(predictions['classes'], class_existence_t)
    if cfg.use_semantic_segmentation_loss:
        losses['S'] = self.semantic_segmentation_loss(predictions['segm'], masks, labels)

    # Divide all losses by the number of positives.
    # Don't do it for loss[P] because that doesn't depend on the anchors.
    total_num_pos = num_pos.data.sum().float()
    for k in losses:
        if k not in ('P', 'E', 'S'):
            losses[k] /= total_num_pos
        else:
            losses[k] /= batch_size

    # Loss Key:
    #  - B: Box Localization Loss
    #  - C: Class Confidence Loss
    #  - M: Mask Loss
    #  - P: Prototype Loss
    #  - D: Coefficient Diversity Loss
    #  - E: Class Existence Loss
    #  - S: Semantic Segmentation Loss
    return losses
def class_existence_loss(self, class_data, class_existence_t):
    """Scaled binary cross-entropy on per-image class-existence logits."""
    bce = F.binary_cross_entropy_with_logits(class_data, class_existence_t, reduction='sum')
    return cfg.class_existence_alpha * bce
def semantic_segmentation_loss(self, segment_data, mask_t, class_t, interpolation_mode='bilinear'):
    """BCE loss for the auxiliary per-class semantic segmentation head.

    Args:
        segment_data: [batch, num_classes-1, mask_h, mask_w] logits.
        mask_t:  list of [num_objs, im_h, im_w] gt masks per image.
        class_t: list of [num_objs] gt class indices per image.
        interpolation_mode: mode used to downsample the gt masks.
    """
    # Note num_classes here is without the background class so cfg.num_classes-1
    batch_size, num_classes, mask_h, mask_w = segment_data.size()
    loss_s = 0

    for idx in range(batch_size):
        cur_segment = segment_data[idx]
        cur_class_t = class_t[idx]

        with torch.no_grad():
            downsampled_masks = F.interpolate(mask_t[idx].unsqueeze(0), (mask_h, mask_w),
                                              mode=interpolation_mode, align_corners=False).squeeze(0)
            downsampled_masks = downsampled_masks.gt(0.5).float()

            # Construct Semantic Segmentation target: per-class union (via
            # element-wise max) of all that class's object masks.
            segment_t = torch.zeros_like(cur_segment, requires_grad=False)
            for obj_idx in range(downsampled_masks.size(0)):
                segment_t[cur_class_t[obj_idx]] = torch.max(segment_t[cur_class_t[obj_idx]], downsampled_masks[obj_idx])

        loss_s += F.binary_cross_entropy_with_logits(cur_segment, segment_t, reduction='sum')

    # Normalize by spatial size; division by batch size happens in forward().
    return loss_s / mask_h / mask_w * cfg.semantic_segmentation_alpha
def ohem_conf_loss(self, conf_data, conf_t, pos, num):
    """Softmax cross-entropy with Online Hard Example Mining: all positives
    plus the hardest negpos_ratio * num_pos negatives per image contribute.

    Args:
        conf_data: [batch, num_priors, num_classes] class logits.
        conf_t:    [batch, num_priors] targets (0 = background, -1 = neutral).
        pos:       [batch, num_priors] mask of positive priors.
        num:       batch size.
    """
    # Compute max conf across batch for hard negative mining
    batch_conf = conf_data.view(-1, self.num_classes)
    if cfg.ohem_use_most_confident:
        # i.e. max(softmax) along classes > 0
        batch_conf = F.softmax(batch_conf, dim=1)
        loss_c, _ = batch_conf[:, 1:].max(dim=1)
    else:
        # i.e. -softmax(class 0 confidence)
        loss_c = log_sum_exp(batch_conf) - batch_conf[:, 0]

    # Hard Negative Mining
    loss_c = loss_c.view(num, -1)
    loss_c[pos] = 0 # filter out pos boxes
    loss_c[conf_t < 0] = 0 # filter out neutrals (conf_t = -1)
    # Double argsort: idx_rank is each prior's rank in its image's
    # descending-loss ordering, so "rank < num_neg" picks the hardest ones.
    _, loss_idx = loss_c.sort(1, descending=True)
    _, idx_rank = loss_idx.sort(1)
    num_pos = pos.long().sum(1, keepdim=True)
    num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
    neg = idx_rank < num_neg.expand_as(idx_rank)

    # Just in case there aren't enough negatives, don't start using positives as negatives
    neg[pos] = 0
    neg[conf_t < 0] = 0 # Filter out neutrals

    # Confidence Loss Including Positive and Negative Examples
    pos_idx = pos.unsqueeze(2).expand_as(conf_data)
    neg_idx = neg.unsqueeze(2).expand_as(conf_data)
    conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
    targets_weighted = conf_t[(pos+neg).gt(0)]
    loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='none')

    if cfg.use_class_balanced_conf:
        # Lazy initialization
        if self.class_instances is None:
            self.class_instances = torch.zeros(self.num_classes, device=targets_weighted.device)

        classes, counts = targets_weighted.unique(return_counts=True)

        for _cls, _cnt in zip(classes.cpu().numpy(), counts.cpu().numpy()):
            self.class_instances[_cls] += _cnt
        self.total_instances += targets_weighted.size(0)

        # Down-weight frequently-seen classes, with a floor of 1/num_classes.
        weighting = 1 - (self.class_instances[targets_weighted] / self.total_instances)
        weighting = torch.clamp(weighting, min=1/self.num_classes)

        # If you do the math, the average weight of self.class_instances is this
        avg_weight = (self.num_classes - 1) / self.num_classes

        loss_c = (loss_c * weighting).sum() / avg_weight
    else:
        loss_c = loss_c.sum()

    return cfg.conf_alpha * loss_c
def focal_conf_loss(self, conf_data, conf_t):
    """
    Focal loss as described in https://arxiv.org/pdf/1708.02002.pdf
    Adapted from https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py
    Note that this uses softmax and not the original sigmoid from the paper.

    Args:
        conf_data: [batch, num_priors, num_classes] class logits.
        conf_t:    [batch, num_priors] targets (0 = background, -1 = neutral).
    """
    conf_t = conf_t.view(-1) # [batch_size*num_priors]
    conf_data = conf_data.view(-1, conf_data.size(-1)) # [batch_size*num_priors, num_classes]

    # Ignore neutral samples (class < 0)
    keep = (conf_t >= 0).float()
    # Writes through the flattened view, so the caller's conf_t is modified
    # in place (neutrals become background, but they're masked out via keep).
    conf_t[conf_t < 0] = 0 # so that gather doesn't drum up a fuss

    logpt = F.log_softmax(conf_data, dim=-1)
    logpt = logpt.gather(1, conf_t.unsqueeze(-1))
    logpt = logpt.view(-1)
    pt    = logpt.exp()

    # I adapted the alpha_t calculation here from
    # https://github.com/pytorch/pytorch/blob/master/modules/detectron/softmax_focal_loss_op.cu
    # You'd think you want all the alphas to sum to one, but in the original implementation they
    # just give background an alpha of 1-alpha and each forground an alpha of alpha.
    background = (conf_t == 0).float()
    at = (1 - cfg.focal_loss_alpha) * background + cfg.focal_loss_alpha * (1 - background)

    loss = -at * (1 - pt) ** cfg.focal_loss_gamma * logpt

    # See comment above for keep
    return cfg.conf_alpha * (loss * keep).sum()
def focal_conf_sigmoid_loss(self, conf_data, conf_t):
    """
    Focal loss but using sigmoid like the original paper.
    Note: To make things mesh easier, the network still predicts 81 class confidences in this mode.
    Because retinanet originally only predicts 80, we simply just don't use conf_data[..., 0]

    Args:
        conf_data: [batch, num_priors, num_classes] class logits.
        conf_t:    [batch, num_priors] targets (0 = background, -1 = neutral).
    """
    num_classes = conf_data.size(-1)

    conf_t = conf_t.view(-1) # [batch_size*num_priors]
    conf_data = conf_data.view(-1, num_classes) # [batch_size*num_priors, num_classes]

    # Ignore neutral samples (class < 0)
    keep = (conf_t >= 0).float()
    # Writes through the flattened view, so the caller's conf_t is modified
    # in place (neutrals become background, but they're masked out via keep).
    conf_t[conf_t < 0] = 0 # can't mask with -1, so filter that out

    # Compute a one-hot embedding of conf_t
    # From https://github.com/kuangliu/pytorch-retinanet/blob/master/utils.py
    conf_one_t = torch.eye(num_classes, device=conf_t.get_device())[conf_t]
    conf_pm_t  = conf_one_t * 2 - 1 # -1 if background, +1 if forground for specific class

    logpt = F.logsigmoid(conf_data * conf_pm_t) # note: 1 - sigmoid(x) = sigmoid(-x)
    pt    = logpt.exp()

    at = cfg.focal_loss_alpha * conf_one_t + (1 - cfg.focal_loss_alpha) * (1 - conf_one_t)
    at[..., 0] = 0 # Set alpha for the background class to 0 because sigmoid focal loss doesn't use it

    loss = -at * (1 - pt) ** cfg.focal_loss_gamma * logpt
    loss = keep * loss.sum(dim=-1)

    return cfg.conf_alpha * loss.sum()
def focal_conf_objectness_loss(self, conf_data, conf_t):
    """
    Instead of using softmax, use class[0] to be the objectness score and do sigmoid focal loss on that.
    Then for the rest of the classes, softmax them and apply CE for only the positive examples.

    If class[0] = 1 implies forground and class[0] = 0 implies background then you achieve something
    similar during test-time to softmax by setting class[1:] = softmax(class[1:]) * class[0] and invert class[0].

    Args:
        conf_data: [batch, num_priors, num_classes] logits; column 0 is the
            objectness score, columns 1: are class logits.
        conf_t:    [batch, num_priors] targets (0 = background, -1 = neutral).
    """
    conf_t = conf_t.view(-1) # [batch_size*num_priors]
    conf_data = conf_data.view(-1, conf_data.size(-1)) # [batch_size*num_priors, num_classes]

    # Ignore neutral samples (class < 0)
    keep = (conf_t >= 0).float()
    # Writes through the flattened view, so the caller's conf_t is modified
    # in place (neutrals become background, but they're masked out via keep).
    conf_t[conf_t < 0] = 0 # so that gather doesn't drum up a fuss

    background = (conf_t == 0).float()
    at = (1 - cfg.focal_loss_alpha) * background + cfg.focal_loss_alpha * (1 - background)

    # Binary focal loss on the objectness column.
    logpt = F.logsigmoid(conf_data[:, 0]) * (1 - background) + F.logsigmoid(-conf_data[:, 0]) * background
    pt    = logpt.exp()

    obj_loss = -at * (1 - pt) ** cfg.focal_loss_gamma * logpt

    # All that was the objectiveness loss--now time for the class confidence loss
    pos_mask = conf_t > 0
    conf_data_pos = (conf_data[:, 1:])[pos_mask] # Now this has just 80 classes
    conf_t_pos    = conf_t[pos_mask] - 1         # So subtract 1 here

    class_loss = F.cross_entropy(conf_data_pos, conf_t_pos, reduction='sum')

    return cfg.conf_alpha * (class_loss + (obj_loss * keep).sum())
def conf_objectness_loss(self, conf_data, conf_t, batch_size, loc_p, loc_t, priors):
    """
    Instead of using softmax, use class[0] to be p(obj) * p(IoU) as in YOLO.
    Then for the rest of the classes, softmax them and apply CE for only the positive examples.

    Args:
        conf_data: [batch, num_priors, num_classes] logits; column 0 is the
            objectness score, columns 1: are class logits.
        conf_t:    [batch, num_priors] targets (0 = background, -1 = neutral).
        batch_size: number of images in the batch.
        loc_p, loc_t: encoded positive box predictions/targets (as built in
            forward's cfg.train_boxes branch).
        priors: [num_priors, 4] prior boxes.
    """
    conf_t = conf_t.view(-1) # [batch_size*num_priors]
    conf_data = conf_data.view(-1, conf_data.size(-1)) # [batch_size*num_priors, num_classes]

    pos_mask = (conf_t > 0)
    neg_mask = (conf_t == 0)

    obj_data = conf_data[:, 0]
    obj_data_pos = obj_data[pos_mask]
    obj_data_neg = obj_data[neg_mask]

    # Don't be confused, this is just binary cross entropy similified
    obj_neg_loss = - F.logsigmoid(-obj_data_neg).sum()

    with torch.no_grad():
        pos_priors = priors.unsqueeze(0).expand(batch_size, -1, -1).reshape(-1, 4)[pos_mask, :]

        # The objectness target for a positive is its decoded box's IoU with
        # the decoded ground-truth box (i.e. p(obj) * p(IoU) as in YOLO).
        boxes_pred = decode(loc_p, pos_priors, cfg.use_yolo_regressors)
        boxes_targ = decode(loc_t, pos_priors, cfg.use_yolo_regressors)

        iou_targets = elemwise_box_iou(boxes_pred, boxes_targ)

    obj_pos_loss = - iou_targets * F.logsigmoid(obj_data_pos) - (1 - iou_targets) * F.logsigmoid(-obj_data_pos)
    obj_pos_loss = obj_pos_loss.sum()

    # All that was the objectiveness loss--now time for the class confidence loss
    conf_data_pos = (conf_data[:, 1:])[pos_mask] # Now this has just 80 classes
    conf_t_pos    = conf_t[pos_mask] - 1         # So subtract 1 here

    class_loss = F.cross_entropy(conf_data_pos, conf_t_pos, reduction='sum')

    return cfg.conf_alpha * (class_loss + obj_pos_loss + obj_neg_loss)
def direct_mask_loss(self, pos_idx, idx_t, loc_data, mask_data, priors, masks):
    """ Crops the gt masks using the predicted bboxes, scales them down, and outputs the BCE loss. """
    # sanitize_coordinates is used below but missing from this module's
    # top-level imports, so this function raised NameError whenever
    # cfg.mask_type == direct with use_gt_bboxes off. Import it locally.
    from ..box_utils import sanitize_coordinates

    loss_m = 0
    for idx in range(mask_data.size(0)):
        with torch.no_grad():
            cur_pos_idx = pos_idx[idx, :, :]
            # Column 1 as a per-prior positive mask — presumably every column
            # of a positive row is True (pos expanded over dim 4); TODO confirm.
            cur_pos_idx_squeezed = cur_pos_idx[:, 1]

            # Shape: [num_priors, 4], decoded predicted bboxes
            pos_bboxes = decode(loc_data[idx, :, :], priors.data, cfg.use_yolo_regressors)
            pos_bboxes = pos_bboxes[cur_pos_idx].view(-1, 4).clamp(0, 1)
            pos_lookup = idx_t[idx, cur_pos_idx_squeezed]

            cur_masks = masks[idx]
            pos_masks = cur_masks[pos_lookup, :, :]

            # Convert bboxes to absolute coordinates
            num_pos, img_height, img_width = pos_masks.size()

            # Take care of all the bad behavior that can be caused by out of bounds coordinates
            x1, x2 = sanitize_coordinates(pos_bboxes[:, 0], pos_bboxes[:, 2], img_width)
            y1, y2 = sanitize_coordinates(pos_bboxes[:, 1], pos_bboxes[:, 3], img_height)

            # Crop each gt mask with the predicted bbox and rescale to the predicted mask size
            # Note that each bounding box crop is a different size so I don't think we can vectorize this
            scaled_masks = []
            for jdx in range(num_pos):
                tmp_mask = pos_masks[jdx, y1[jdx]:y2[jdx], x1[jdx]:x2[jdx]]

                # Restore any dimensions we've left out because our bbox was 1px wide
                while tmp_mask.dim() < 2:
                    tmp_mask = tmp_mask.unsqueeze(0)

                new_mask = F.adaptive_avg_pool2d(tmp_mask.unsqueeze(0), cfg.mask_size)
                scaled_masks.append(new_mask.view(1, -1))

            mask_t = torch.cat(scaled_masks, 0).gt(0.5).float() # Threshold downsampled mask

        pos_mask_data = mask_data[idx, cur_pos_idx_squeezed, :]
        loss_m += F.binary_cross_entropy(torch.clamp(pos_mask_data, 0, 1), mask_t, reduction='sum') * cfg.mask_alpha

    return loss_m
def coeff_diversity_loss(self, coeffs, instance_t):
    """
    coeffs should be size [num_pos, num_coeffs]
    instance_t should be size [num_pos] and be values from 0 to num_instances-1
    """
    num_pos = coeffs.size(0)
    instance_t = instance_t.view(-1)  # flatten defensively

    # Pairwise cosine similarity between coefficient vectors, rescaled to [0, 1].
    unit_coeffs = F.normalize(coeffs, dim=1)
    cos_sim = (unit_coeffs @ unit_coeffs.t() + 1) / 2

    # 1 where a pair of priors matched the same gt instance, 0 otherwise.
    same_instance = (instance_t[:, None] == instance_t[None, :]).float()

    # Same instance -> penalize dissimilarity (cosine distance);
    # different instances -> penalize similarity.
    pair_loss = same_instance * (1 - cos_sim) + (1 - same_instance) * cos_sim

    # Only divide by num_pos once here: the sum is over a num_pos x num_pos matrix
    # and all losses get divided by num_pos again at the end, so one extra time suffices.
    return cfg.mask_proto_coeff_diversity_alpha * pair_loss.sum() / num_pos
def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, masks, gt_box_t, score_data, inst_data, labels, interpolation_mode='bilinear'):
    """Lincomb mask loss: compares gt masks against the linear combination of the
    prototype masks with the predicted coefficients (BCE for sigmoid activation,
    smooth-L1 otherwise). When cfg.use_maskiou is set, also collects and returns
    the inputs/targets used to train the maskiou net."""
    mask_h = proto_data.size(1)
    mask_w = proto_data.size(2)

    process_gt_bboxes = cfg.mask_proto_normalize_emulate_roi_pooling or cfg.mask_proto_crop

    if cfg.mask_proto_remove_empty_masks:
        # Make sure to store a copy of this because we edit it to get rid of all-zero masks
        pos = pos.clone()

    loss_m = 0
    loss_d = 0 # Coefficient diversity loss

    maskiou_t_list = []
    maskiou_net_input_list = []
    label_t_list = []

    for idx in range(mask_data.size(0)):
        with torch.no_grad():
            # Downsample the gt masks to prototype resolution: [mask_h, mask_w, num_objects].
            downsampled_masks = F.interpolate(masks[idx].unsqueeze(0), (mask_h, mask_w),
                                              mode=interpolation_mode, align_corners=False).squeeze(0)
            downsampled_masks = downsampled_masks.permute(1, 2, 0).contiguous()

            if cfg.mask_proto_binarize_downsampled_gt:
                downsampled_masks = downsampled_masks.gt(0.5).float()

            if cfg.mask_proto_remove_empty_masks:
                # Get rid of gt masks that are so small they get downsampled away
                very_small_masks = (downsampled_masks.sum(dim=(0,1)) <= 0.0001)
                for i in range(very_small_masks.size(0)):
                    if very_small_masks[i]:
                        pos[idx, idx_t[idx] == i] = 0

            if cfg.mask_proto_reweight_mask_loss:
                # Ensure that the gt is binary
                if not cfg.mask_proto_binarize_downsampled_gt:
                    bin_gt = downsampled_masks.gt(0.5).float()
                else:
                    bin_gt = downsampled_masks

                # Per-pixel weights that balance foreground vs background per object.
                gt_foreground_norm = bin_gt / (torch.sum(bin_gt, dim=(0,1), keepdim=True) + 0.0001)
                gt_background_norm = (1-bin_gt) / (torch.sum(1-bin_gt, dim=(0,1), keepdim=True) + 0.0001)

                mask_reweighting = gt_foreground_norm * cfg.mask_proto_reweight_coeff + gt_background_norm
                mask_reweighting *= mask_h * mask_w

        cur_pos = pos[idx]
        pos_idx_t = idx_t[idx, cur_pos]

        if process_gt_bboxes:
            # Note: this is in point-form
            if cfg.mask_proto_crop_with_pred_box:
                pos_gt_box_t = decode(loc_data[idx, :, :], priors.data, cfg.use_yolo_regressors)[cur_pos]
            else:
                pos_gt_box_t = gt_box_t[idx, cur_pos]

        if pos_idx_t.size(0) == 0:
            continue

        proto_masks = proto_data[idx]
        proto_coef = mask_data[idx, cur_pos, :]
        if cfg.use_mask_scoring:
            mask_scores = score_data[idx, cur_pos, :]

        if cfg.mask_proto_coeff_diversity_loss:
            if inst_data is not None:
                div_coeffs = inst_data[idx, cur_pos, :]
            else:
                div_coeffs = proto_coef

            loss_d += self.coeff_diversity_loss(div_coeffs, pos_idx_t)

        # If we have over the allowed number of masks, select a random sample
        old_num_pos = proto_coef.size(0)
        if old_num_pos > cfg.masks_to_train:
            perm = torch.randperm(proto_coef.size(0))
            select = perm[:cfg.masks_to_train]

            proto_coef = proto_coef[select, :]
            pos_idx_t = pos_idx_t[select]

            if process_gt_bboxes:
                pos_gt_box_t = pos_gt_box_t[select, :]
            if cfg.use_mask_scoring:
                mask_scores = mask_scores[select, :]

        num_pos = proto_coef.size(0)
        mask_t = downsampled_masks[:, :, pos_idx_t]
        label_t = labels[idx][pos_idx_t]

        # Size: [mask_h, mask_w, num_pos]
        pred_masks = proto_masks @ proto_coef.t()
        pred_masks = cfg.mask_proto_mask_activation(pred_masks)

        if cfg.mask_proto_double_loss:
            # Extra, un-cropped loss term in addition to the (possibly cropped) one below.
            if cfg.mask_proto_mask_activation == activation_func.sigmoid:
                pre_loss = F.binary_cross_entropy(torch.clamp(pred_masks, 0, 1), mask_t, reduction='sum')
            else:
                pre_loss = F.smooth_l1_loss(pred_masks, mask_t, reduction='sum')

            loss_m += cfg.mask_proto_double_loss_alpha * pre_loss

        if cfg.mask_proto_crop:
            # Zero out predictions outside each gt box before computing the loss.
            pred_masks = crop(pred_masks, pos_gt_box_t)

        if cfg.mask_proto_mask_activation == activation_func.sigmoid:
            pre_loss = F.binary_cross_entropy(torch.clamp(pred_masks, 0, 1), mask_t, reduction='none')
        else:
            pre_loss = F.smooth_l1_loss(pred_masks, mask_t, reduction='none')

        if cfg.mask_proto_normalize_mask_loss_by_sqrt_area:
            gt_area = torch.sum(mask_t, dim=(0, 1), keepdim=True)
            pre_loss = pre_loss / (torch.sqrt(gt_area) + 0.0001)

        if cfg.mask_proto_reweight_mask_loss:
            pre_loss = pre_loss * mask_reweighting[:, :, pos_idx_t]

        if cfg.mask_proto_normalize_emulate_roi_pooling:
            # Normalize by each gt box's area measured in prototype pixels.
            weight = mask_h * mask_w if cfg.mask_proto_crop else 1
            pos_gt_csize = center_size(pos_gt_box_t)
            gt_box_width = pos_gt_csize[:, 2] * mask_w
            gt_box_height = pos_gt_csize[:, 3] * mask_h
            pre_loss = pre_loss.sum(dim=(0, 1)) / gt_box_width / gt_box_height * weight

        # If the number of masks were limited scale the loss accordingly
        if old_num_pos > num_pos:
            pre_loss *= old_num_pos / num_pos

        loss_m += torch.sum(pre_loss)

        if cfg.use_maskiou:
            if cfg.discard_mask_area > 0:
                # Skip tiny gt masks; keep only instances above the area threshold.
                gt_mask_area = torch.sum(mask_t, dim=(0, 1))
                select = gt_mask_area > cfg.discard_mask_area

                if torch.sum(select) < 1:
                    continue

                pos_gt_box_t = pos_gt_box_t[select, :]
                pred_masks = pred_masks[:, :, select]
                mask_t = mask_t[:, :, select]
                label_t = label_t[select]

            maskiou_net_input = pred_masks.permute(2, 0, 1).contiguous().unsqueeze(1)
            pred_masks = pred_masks.gt(0.5).float()
            maskiou_t = self._mask_iou(pred_masks, mask_t)

            maskiou_net_input_list.append(maskiou_net_input)
            maskiou_t_list.append(maskiou_t)
            label_t_list.append(label_t)

    losses = {'M': loss_m * cfg.mask_alpha / mask_h / mask_w}

    if cfg.mask_proto_coeff_diversity_loss:
        losses['D'] = loss_d

    if cfg.use_maskiou:
        # discard_mask_area discarded every mask in the batch, so nothing to do here
        if len(maskiou_t_list) == 0:
            return losses, None

        maskiou_t = torch.cat(maskiou_t_list)
        label_t = torch.cat(label_t_list)
        maskiou_net_input = torch.cat(maskiou_net_input_list)

        num_samples = maskiou_t.size(0)
        if cfg.maskious_to_train > 0 and num_samples > cfg.maskious_to_train:
            perm = torch.randperm(num_samples)
            # NOTE(review): the guard above uses cfg.maskious_to_train but the slice
            # below uses cfg.masks_to_train — only equivalent when the two settings
            # match; verify against the config defaults.
            select = perm[:cfg.masks_to_train]
            maskiou_t = maskiou_t[select]
            label_t = label_t[select]
            maskiou_net_input = maskiou_net_input[select]

        return losses, [maskiou_net_input, maskiou_t, label_t]

    return losses
def _mask_iou(self, mask1, mask2):
intersection = torch.sum(mask1*mask2, dim=(0, 1))
area1 = torch.sum(mask1, dim=(0, 1))
area2 = torch.sum(mask2, dim=(0, 1))
union = (area1 + area2) - intersection
ret = intersection / union
return ret
def mask_iou_loss(self, net, maskiou_targets):
    # Smooth-L1 between the maskiou net's predicted IoU (for each sample's gt
    # class) and the IoU targets computed in lincomb_mask_loss.
    maskiou_net_input, maskiou_t, label_t = maskiou_targets

    pred_iou = net.maskiou_net(maskiou_net_input)
    # Pick out each sample's own class column, then flatten.
    pred_iou = torch.gather(pred_iou, dim=1, index=label_t[:, None]).view(-1)

    loss_i = F.smooth_l1_loss(pred_iou, maskiou_t, reduction='sum')
    return loss_i * cfg.maskiou_alpha
================================================
FILE: layers/output_utils.py
================================================
""" Contains functions used to sanitize and prepare the output of Yolact. """
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import cv2
from data import cfg, mask_type, MEANS, STD, activation_func
from utils.augmentations import Resize
from utils import timer
from .box_utils import crop, sanitize_coordinates
def postprocess(det_output, w, h, batch_idx=0, interpolation_mode='bilinear',
                visualize_lincomb=False, crop_masks=True, score_threshold=0):
    """
    Postprocesses the output of Yolact on testing mode into a format that makes sense,
    accounting for all the possible configuration settings.

    Args:
        - det_output: The list of dicts that Detect outputs.
        - w: The real width of the image.
        - h: The real height of the image.
        - batch_idx: If you have multiple images for this batch, the image's index in the batch.
        - interpolation_mode: Can be 'nearest' | 'area' | 'bilinear' (see torch.nn.functional.interpolate)

    Returns 4 torch Tensors (in the following order):
        - classes [num_det]: The class idx for each detection.
        - scores  [num_det]: The confidence score for each detection.
        - boxes   [num_det, 4]: The bounding box for each detection in absolute point form.
        - masks   [num_det, h, w]: Full image masks for each detection.
    """
    dets = det_output[batch_idx]
    net = dets['net']
    dets = dets['detection']

    if dets is None:
        return [torch.Tensor()] * 4 # Warning, this is 4 copies of the same thing

    # Drop low-confidence detections up front ('proto' is shared, so leave it alone).
    if score_threshold > 0:
        keep = dets['score'] > score_threshold

        for k in dets:
            if k != 'proto':
                dets[k] = dets[k][keep]

        if dets['score'].size(0) == 0:
            return [torch.Tensor()] * 4

    # Actually extract everything from dets now
    classes = dets['class']
    boxes = dets['box']
    scores = dets['score']
    masks = dets['mask']

    if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch:
        # At this points masks is only the coefficients
        proto_data = dets['proto']

        # Test flag, do not upvote
        if cfg.mask_proto_debug:
            np.save('scripts/proto.npy', proto_data.cpu().numpy())

        if visualize_lincomb:
            display_lincomb(proto_data, masks)

        # Linear combination of the prototypes weighted by each detection's coefficients.
        masks = proto_data @ masks.t()
        masks = cfg.mask_proto_mask_activation(masks)

        # Crop masks before upsampling because you know why
        if crop_masks:
            masks = crop(masks, boxes)

        # Permute into the correct output shape [num_dets, proto_h, proto_w]
        masks = masks.permute(2, 0, 1).contiguous()

        if cfg.use_maskiou:
            with timer.env('maskiou_net'):
                with torch.no_grad():
                    maskiou_p = net.maskiou_net(masks.unsqueeze(1))
                    # Keep only each detection's own class's predicted mask IoU.
                    maskiou_p = torch.gather(maskiou_p, dim=1, index=classes.unsqueeze(1)).squeeze(1)
                    if cfg.rescore_mask:
                        if cfg.rescore_bbox:
                            scores = scores * maskiou_p
                        else:
                            # Keep both score sets: [box scores, mask-rescored scores].
                            scores = [scores, scores * maskiou_p]

        # Scale masks up to the full image
        masks = F.interpolate(masks.unsqueeze(0), (h, w), mode=interpolation_mode, align_corners=False).squeeze(0)

        # Binarize the masks
        masks.gt_(0.5)

    # Clamp boxes to the image and convert to integer pixel coordinates.
    boxes[:, 0], boxes[:, 2] = sanitize_coordinates(boxes[:, 0], boxes[:, 2], w, cast=False)
    boxes[:, 1], boxes[:, 3] = sanitize_coordinates(boxes[:, 1], boxes[:, 3], h, cast=False)
    boxes = boxes.long()

    if cfg.mask_type == mask_type.direct and cfg.eval_mask_branch:
        # Upscale masks
        full_masks = torch.zeros(masks.size(0), h, w)

        for jdx in range(masks.size(0)):
            x1, y1, x2, y2 = boxes[jdx, :]

            mask_w = x2 - x1
            mask_h = y2 - y1

            # Just in case
            if mask_w * mask_h <= 0 or mask_w < 0:
                continue

            # Paste each fixed-size mask into its box within a full-image canvas.
            mask = masks[jdx, :].view(1, 1, cfg.mask_size, cfg.mask_size)
            mask = F.interpolate(mask, (mask_h, mask_w), mode=interpolation_mode, align_corners=False)
            mask = mask.gt(0.5).float()
            full_masks[jdx, y1:y2, x1:x2] = mask

        masks = full_masks

    return classes, scores, boxes, masks
def undo_image_transformation(img, w, h):
    """
    Takes a transformed image tensor and returns a numpy ndarray that is untransformed.
    Arguments w and h are the original height and width of the image.
    """
    # CHW tensor -> HWC numpy array.
    canvas = img.permute(1, 2, 0).cpu().numpy()

    # To BGR (the channel order the normalization constants were applied in).
    canvas = canvas[:, :, (2, 1, 0)]

    transform = cfg.backbone.transform
    if transform.normalize:
        # Undo (x - mean) / std, then rescale into [0, 1].
        canvas = (canvas * np.array(STD) + np.array(MEANS)) / 255.0
    elif transform.subtract_means:
        # Undo the mean subtraction and rescale into [0, 1].
        canvas = (canvas / 255.0 + np.array(MEANS) / 255.0).astype(np.float32)

    # Back to RGB and clamp before display.
    canvas = canvas[:, :, (2, 1, 0)]
    canvas = np.clip(canvas, 0, 1)

    return cv2.resize(canvas, (w, h))
def display_lincomb(proto_data, masks):
    """Debug visualization: show how the prototype masks linearly combine (weighted
    by one detection's coefficients) into the final mask.

    Args:
        proto_data: [proto_h, proto_w, num_protos] prototype masks.
        masks: [num_dets, num_protos] mask coefficients.
    Displays matplotlib windows; returns nothing.
    """
    out_masks = torch.matmul(proto_data, masks.t())
    # out_masks = cfg.mask_proto_mask_activation(out_masks)

    for kdx in range(1):
        jdx = kdx + 0
        import matplotlib.pyplot as plt

        coeffs = masks[jdx, :].cpu().numpy()
        # Visit prototypes in order of decreasing coefficient magnitude.
        idx = np.argsort(-np.abs(coeffs))
        # plt.bar(list(range(idx.shape[0])), coeffs[idx])
        # plt.show()

        coeffs_sort = coeffs[idx]
        arr_h, arr_w = (4,8)
        proto_h, proto_w, _ = proto_data.size()

        # Tiled canvases: individual (scaled) prototypes, and the running combination.
        arr_img = np.zeros([proto_h*arr_h, proto_w*arr_w])
        arr_run = np.zeros([proto_h*arr_h, proto_w*arr_w])
        test = torch.sum(proto_data, -1).cpu().numpy()

        for y in range(arr_h):
            for x in range(arr_w):
                i = arr_w * y + x

                # Accumulate the weighted prototypes one at a time.
                if i == 0:
                    running_total = proto_data[:, :, idx[i]].cpu().numpy() * coeffs_sort[i]
                else:
                    running_total += proto_data[:, :, idx[i]].cpu().numpy() * coeffs_sort[i]

                running_total_nonlin = running_total
                if cfg.mask_proto_mask_activation == activation_func.sigmoid:
                    running_total_nonlin = (1/(1+np.exp(-running_total_nonlin)))

                arr_img[y*proto_h:(y+1)*proto_h, x*proto_w:(x+1)*proto_w] = (proto_data[:, :, idx[i]] / torch.max(proto_data[:, :, idx[i]])).cpu().numpy() * coeffs_sort[i]
                # np.float was removed in NumPy 1.24; the builtin float is the same dtype.
                arr_run[y*proto_h:(y+1)*proto_h, x*proto_w:(x+1)*proto_w] = (running_total_nonlin > 0.5).astype(float)

        plt.imshow(arr_img)
        plt.show()
        # plt.imshow(arr_run)
        # plt.show()
        # plt.imshow(test)
        # plt.show()
        plt.imshow(out_masks[:, :, jdx].cpu().numpy())
        plt.show()
================================================
FILE: run_coco_eval.py
================================================
"""
Runs the coco-supplied cocoeval script to evaluate detections
outputted by using the output_coco_json flag in eval.py.
"""
import argparse
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
# Command line: paths to the detection result files (produced by eval.py with
# --output_coco_json), the gt annotations, and which result types to evaluate.
parser = argparse.ArgumentParser(description='COCO Detections Evaluator')
parser.add_argument('--bbox_det_file', default='results/bbox_detections.json', type=str)
parser.add_argument('--mask_det_file', default='results/mask_detections.json', type=str)
parser.add_argument('--gt_ann_file', default='data/coco/annotations/instances_val2017.json', type=str)
parser.add_argument('--eval_type', default='both', choices=['bbox', 'mask', 'both'], type=str)
# Parsed at import time so the __main__ block below can use it directly.
args = parser.parse_args()
if __name__ == '__main__':
    eval_bbox = (args.eval_type in ('bbox', 'both'))
    eval_mask = (args.eval_type in ('mask', 'both'))

    print('Loading annotations...')
    gt_annotations = COCO(args.gt_ann_file)

    # Load only the result files we were asked to evaluate (bbox first, mask second).
    bbox_dets = gt_annotations.loadRes(args.bbox_det_file) if eval_bbox else None
    mask_dets = gt_annotations.loadRes(args.mask_det_file) if eval_mask else None

    def run_eval(dets, iou_type):
        # Standard COCOeval pipeline: evaluate, accumulate, print the summary table.
        evaluator = COCOeval(gt_annotations, dets, iou_type)
        evaluator.evaluate()
        evaluator.accumulate()
        evaluator.summarize()

    if eval_bbox:
        print('\nEvaluating BBoxes:')
        run_eval(bbox_dets, 'bbox')

    if eval_mask:
        print('\nEvaluating Masks:')
        run_eval(mask_dets, 'segm')
================================================
FILE: scripts/augment_bbox.py
================================================
import os.path as osp
import json, pickle
import sys
from math import sqrt
from itertools import product
import torch
from numpy import random
import numpy as np
# Target size used by the Resize step below (boxes end up relative anyway).
max_image_size = 550
# Running count of processed boxes; used only for the progress printout.
augment_idx = 0

# Output (augmented, relative point-form boxes) and input pickle paths.
dump_file = 'weights/bboxes_aug.pkl'
box_file = 'weights/bboxes.pkl'
def augment_boxes(bboxes):
    # Run one augmentation pass over every raw box record, stack the results,
    # and pickle the whole array to dump_file.
    augmented = [prep_box(box) for box in bboxes]
    bboxes_rel = np.concatenate(augmented, axis=0)

    with open(dump_file, 'wb') as f:
        pickle.dump(bboxes_rel, f)
def prep_box(box_list):
    """Apply one random augmentation pass (expand, crop, mirror, resize) to a raw
    [im_w, im_h, x, y, w, h] record; returns the box in relative point form."""
    global augment_idx
    boxes = np.array([box_list[2:]], dtype=np.float32)

    # Image width and height
    width, height = box_list[:2]

    # To point form
    boxes[:, 2:] += boxes[:, :2]

    # Expand
    ratio = random.uniform(1, 4)
    left = random.uniform(0, width*ratio - width)
    top = random.uniform(0, height*ratio - height)

    height *= ratio
    width *= ratio

    # Shift the box into the expanded canvas.
    boxes[:, :2] += (int(left), int(top))
    boxes[:, 2:] += (int(left), int(top))

    # RandomSampleCrop
    height, width, boxes = random_sample_crop(height, width, boxes)

    # RandomMirror
    if random.randint(0, 2):
        boxes[:, 0::2] = width - boxes[:, 2::-2]

    # Resize
    boxes[:, [0, 2]] *= (max_image_size / width)
    boxes[:, [1, 3]] *= (max_image_size / height)
    width = height = max_image_size

    # ToPercentCoords
    boxes[:, [0, 2]] /= width
    boxes[:, [1, 3]] /= height

    # Progress printout every 50k boxes.
    if augment_idx % 50000 == 0:
        print('Current idx: %d' % augment_idx)

    augment_idx += 1

    return boxes
# SSD-style crop sampling modes. Each tuple is a (min_iou, max_iou) constraint on
# the crop's jaccard overlap with the gt boxes; None inside a tuple means that
# side is unconstrained.
sample_options = (
    # using entire original input image
    None,
    # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.7,.9
    (0.1, None),
    (0.3, None),
    (0.7, None),
    (0.9, None),
    # randomly sample a patch
    (None, None),
)
def intersect(box_a, box_b):
    """Intersection area between each point-form box in box_a [N, 4] and one
    point-form box box_b [4]; returns a length-N array."""
    overlap_max = np.minimum(box_a[:, 2:], box_b[2:])
    overlap_min = np.maximum(box_a[:, :2], box_b[:2])
    sides = np.clip(overlap_max - overlap_min, a_min=0, a_max=np.inf)
    return sides[:, 0] * sides[:, 1]
def jaccard_numpy(box_a, box_b):
    """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
    is simply the intersection over union of two boxes.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: Multiple bounding boxes, Shape: [num_boxes,4]
        box_b: Single bounding box, Shape: [4]
    Return:
        jaccard overlap: Shape: [box_a.shape[0]]
    """
    inter = intersect(box_a, box_b)

    # Point-form areas: (x2 - x1) * (y2 - y1).
    area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])

    return inter / (area_a + area_b - inter)
def random_sample_crop(height, width, boxes=None):
    """Randomly crop an (height, width) region following the SSD sampling strategy,
    keeping only boxes whose centers fall inside the crop.
    Returns (new_height, new_width, surviving boxes in the crop's coordinates)."""
    global sample_options
    while True:
        # randomly choose a mode
        mode = random.choice(sample_options)
        if mode is None:
            # Use the entire original input image.
            return height, width, boxes

        min_iou, max_iou = mode
        if min_iou is None:
            min_iou = float('-inf')
        if max_iou is None:
            max_iou = float('inf')

        # Try up to 50 crops for this mode before re-rolling a new mode.
        for _ in range(50):
            w = random.uniform(0.3 * width, width)
            h = random.uniform(0.3 * height, height)

            # Aspect ratio constraint between 0.5 and 2.
            if h / w < 0.5 or h / w > 2:
                continue

            left = random.uniform(0, width - w)
            top = random.uniform(0, height - h)

            # Crop rectangle in point form.
            rect = np.array([int(left), int(top), int(left+w), int(top+h)])

            overlap = jaccard_numpy(boxes, rect)

            # NOTE(review): upstream SSD augmentations reject crops with `or` here;
            # with `and`, a crop is rejected only when both IoU bounds fail — confirm
            # this is intentional.
            if overlap.min() < min_iou and max_iou < overlap.max():
                continue

            # Keep only boxes whose centers lie strictly inside the crop.
            centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0
            m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])
            m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])
            mask = m1 * m2

            if not mask.any():
                continue

            current_boxes = boxes[mask, :].copy()

            # Clip boxes to the crop and shift them into the crop's coordinate frame.
            current_boxes[:, :2] = np.maximum(current_boxes[:, :2], rect[:2])
            current_boxes[:, :2] -= rect[:2]
            current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], rect[2:])
            current_boxes[:, 2:] -= rect[:2]

            return h, w, current_boxes
if __name__ == '__main__':
    # Load the raw boxes dumped earlier, augment them all, and pickle the result.
    with open(box_file, 'rb') as f:
        bboxes = pickle.load(f)

    augment_boxes(bboxes)
================================================
FILE: scripts/bbox_recall.py
================================================
"""
This script compiles all the bounding boxes in the training data and
clusters them for each convout resolution on which they're used.
Run this script from the Yolact root directory.
"""
import os.path as osp
import json, pickle
import sys
from math import sqrt
from itertools import product
import torch
import random
import numpy as np
# Raw and augmented (see scripts/augment_bbox.py) box pickles.
dump_file = 'weights/bboxes.pkl'
aug_file = 'weights/bboxes_aug.pkl'

# Measure recall against the augmented boxes rather than the raw dump.
use_augmented_boxes = True
def intersect(box_a, box_b):
    """ We resize both tensors to [A,B,2] without new malloc:
    [A,2] -> [A,1,2] -> [A,B,2]
    [B,2] -> [1,B,2] -> [A,B,2]
    Then we compute the area of intersect between box_a and box_b.
    Args:
      box_a: (tensor) bounding boxes, Shape: [A,4].
      box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
      (tensor) intersection area, Shape: [A,B].
    """
    A, B = box_a.size(0), box_b.size(0)

    # Overlapping corner coordinates, broadcast to [A, B, 2].
    max_xy = torch.min(box_a[:, None, 2:].expand(A, B, 2),
                       box_b[None, :, 2:].expand(A, B, 2))
    min_xy = torch.max(box_a[:, None, :2].expand(A, B, 2),
                       box_b[None, :, :2].expand(A, B, 2))

    wh = (max_xy - min_xy).clamp(min=0)
    return wh[:, :, 0] * wh[:, :, 1]
def jaccard(box_a, box_b, iscrowd=False):
    """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
    is simply the intersection over union of two boxes. Here we operate on
    ground truth boxes and default boxes. If iscrowd=True, put the crowd in box_b.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    inter = intersect(box_a, box_b)

    def area(boxes):
        # Point-form area: (x2 - x1) * (y2 - y1).
        return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    area_a = area(box_a).unsqueeze(1).expand_as(inter)  # [A,B]
    area_b = area(box_b).unsqueeze(0).expand_as(inter)  # [A,B]

    # For crowd gt, normalize by the first set's area only.
    if iscrowd:
        return inter / area_a
    return inter / (area_a + area_b - inter)  # [A,B]
# Also convert to point form
def to_relative(bboxes):
    # Rows are [im_w, im_h, x, y, w, h]; produce relative [x1, y1, x2, y2].
    img_size = bboxes[:, :2]
    top_left = bboxes[:, 2:4]
    bottom_right = bboxes[:, 2:4] + bboxes[:, 4:]
    return np.concatenate((top_left / img_size, bottom_right / img_size), axis=1)
def make_priors(conv_size, scales, aspect_ratios):
    """Generate point-form anchor boxes (relative coords) for one convout of size
    conv_size = (h, w), one anchor per (cell, scale, aspect ratio) combination."""
    conv_h, conv_w = conv_size

    priors = []
    # Iteration order is important (it has to sync up with the convout)
    for j, i in product(range(conv_h), range(conv_w)):
        # Anchor center at the middle of cell (j, i), in relative image coords.
        x = (i + 0.5) / conv_w
        y = (j + 0.5) / conv_h

        for scale, ars in zip(scales, aspect_ratios):
            for ar in ars:
                w = scale * ar / conv_w
                h = scale / ar / conv_h
                priors.append([x - w / 2, y - h / 2, x + w / 2, y + h / 2])

    return np.array(priors).reshape(-1, 4)
# fixed_ssd_config
# scales = [[3.5, 4.95], [3.6, 4.90], [3.3, 4.02], [2.7, 3.10], [2.1, 2.37], [2.1, 2.37], [1.8, 1.92]]
# aspect_ratios = [ [[1, sqrt(2), 1/sqrt(2), sqrt(3), 1/sqrt(3)][:n], [1]] for n in [3, 5, 5, 5, 3, 3, 3] ]
# conv_sizes = [(35, 35), (18, 18), (9, 9), (5, 5), (3, 3), (2, 2)]

# Active anchor configuration: per-convout scales (in convout cells) and the
# aspect-ratio groups zipped with them by make_priors above.
scales = [[1.68, 2.91],
          [2.95, 2.22, 0.84],
          [2.23, 2.17, 3.12],
          [0.76, 1.94, 2.72],
          [2.10, 2.65],
          [1.80, 1.92]]
aspect_ratios = [[[0.72, 0.96], [0.68, 1.17]],
                 [[1.28, 0.66], [0.63, 1.23], [0.89, 1.40]],
                 [[2.05, 1.24], [0.57, 0.83], [0.61, 1.15]],
                 [[1.00, 2.21], [0.47, 1.60], [1.44, 0.79]],
                 [[1.00, 1.41, 0.71, 1.73, 0.58], [1.08]],
                 [[1.00, 1.41, 0.71, 1.73, 0.58], [1.00]]]
conv_sizes = [(35, 35), (18, 18), (9, 9), (5, 5), (3, 3), (2, 2)]

# yrm33_config
# scales = [ [5.3] ] * 5
# aspect_ratios = [ [[1, 1/sqrt(2), sqrt(2)]] ]*5
# conv_sizes = [(136, 136), (67, 67), (33, 33), (16, 16), (8, 8)]

# COCO-style size buckets used in the recall breakdown below.
SMALL = 0
MEDIUM = 1
LARGE = 2
if __name__ == '__main__':
    with open(dump_file, 'rb') as f:
        bboxes = pickle.load(f)

    # Bucket every gt box into small/medium/large using the COCO area thresholds
    # (32^2 and 96^2 pixels).
    sizes = []
    smalls = []
    for i in range(len(bboxes)):
        area = bboxes[i][4] * bboxes[i][5]
        if area < 32 ** 2:
            sizes.append(SMALL)
            smalls.append(area)
        elif area < 96 ** 2:
            sizes.append(MEDIUM)
        else:
            sizes.append(LARGE)

    # Each box is in the form [im_w, im_h, pos_x, pos_y, size_x, size_y]
    if use_augmented_boxes:
        with open(aug_file, 'rb') as f:
            bboxes_rel = pickle.load(f)
    else:
        bboxes_rel = to_relative(np.array(bboxes))

    with torch.no_grad():
        sizes = torch.Tensor(sizes)

        # Build every anchor for every convout resolution and move them to the GPU.
        anchors = [make_priors(cs, s, ar) for cs, s, ar in zip(conv_sizes, scales, aspect_ratios)]
        anchors = np.concatenate(anchors, axis=0)
        anchors = torch.Tensor(anchors).cuda()

        bboxes_rel = torch.Tensor(bboxes_rel).cuda()
        perGTAnchorMax = torch.zeros(bboxes_rel.shape[0]).cuda()

        # Chunk the gt boxes so the [gt x anchors] IoU matrix fits in GPU memory.
        chunk_size = 1000
        for i in range((bboxes_rel.size(0) // chunk_size) + 1):
            start = i * chunk_size
            end = min((i + 1) * chunk_size, bboxes_rel.size(0))

            ious = jaccard(bboxes_rel[start:end, :], anchors)
            maxes, maxidx = torch.max(ious, dim=1)

            perGTAnchorMax[start:end] = maxes

        # A gt box counts as recalled if its best anchor overlaps it with IoU > 0.5.
        hits = (perGTAnchorMax > 0.5).float()

        print('Total recall: %.2f' % (torch.sum(hits) / hits.size(0) * 100))
        print()

        # Per size-bucket recall; guard against empty buckets with a dummy size of 1.
        for i, metric in zip(range(3), ('small', 'medium', 'large')):
            _hits = hits[sizes == i]
            _size = (1 if _hits.size(0) == 0 else _hits.size(0))
            print(metric + ' recall: %.2f' % ((torch.sum(_hits) / _size) * 100))
================================================
FILE: scripts/cluster_bbox_sizes.py
================================================
"""
This script compiles all the bounding boxes in the training data and
clusters them for each convout resolution on which they're used.
Run this script from the Yolact root directory.
"""
import os.path as osp
import json, pickle
import sys
import numpy as np
import sklearn.cluster as cluster
dump_file = 'weights/bboxes.pkl'

# Boxes are scaled to a max_size square, then clustered first by scale and then
# by aspect ratio within each scale cluster.
max_size = 550
num_scale_clusters = 5
num_aspect_ratio_clusters = 3
def to_relative(bboxes):
    # Columns 2:4 expressed relative to columns 0:2 (the image dimensions).
    img_dims = bboxes[:, :2]
    box_dims = bboxes[:, 2:4]
    return box_dims / img_dims
def process(bboxes):
    # Relative box dimensions scaled to the max_size x max_size training resolution.
    return to_relative(bboxes) * max_size
if __name__ == '__main__':
    with open(dump_file, 'rb') as f:
        bboxes = pickle.load(f)

    bboxes = np.array(bboxes)
    bboxes = process(bboxes)

    # Drop degenerate boxes that collapse below one pixel at max_size scale.
    bboxes = bboxes[(bboxes[:, 0] > 1) * (bboxes[:, 1] > 1)]

    # Cluster by scale (geometric mean of the two dimensions) first.
    scale = np.sqrt(bboxes[:, 0] * bboxes[:, 1]).reshape(-1, 1)

    # NOTE: KMeans' n_jobs parameter was deprecated in scikit-learn 0.23 and
    # removed in 1.0, so it is no longer passed here.
    clusterer = cluster.KMeans(n_clusters=num_scale_clusters, random_state=99)
    assignments = clusterer.fit_predict(scale)
    counts = np.bincount(assignments)

    cluster_centers = clusterer.cluster_centers_
    center_indices = list(range(num_scale_clusters))
    center_indices.sort(key=lambda x: cluster_centers[x, 0])

    for idx in center_indices:
        center = cluster_centers[idx, 0]
        boxes_for_center = bboxes[assignments == idx]

        # Within each scale cluster, cluster the aspect ratios (w / h).
        aspect_ratios = (boxes_for_center[:, 0] / boxes_for_center[:, 1]).reshape(-1, 1)

        ar_clusterer = cluster.KMeans(n_clusters=num_aspect_ratio_clusters, random_state=idx)
        ar_assignments = ar_clusterer.fit_predict(aspect_ratios)
        ar_counts = np.bincount(ar_assignments)
        ar_centers = list(ar_clusterer.cluster_centers_.reshape(-1))

        # Report aspect-ratio clusters by popularity (largest first).
        cidx = list(range(num_aspect_ratio_clusters))
        cidx.sort(key=lambda x: -ar_counts[x])

        # import code
        # code.interact(local=locals())

        print('%.3f (%d) aspect ratios:' % (center, counts[idx]))
        # Use a distinct loop variable so the outer `idx` isn't clobbered mid-loop.
        for ar_idx in cidx:
            print('\t%.2f (%d)' % (ar_centers[ar_idx], ar_counts[ar_idx]))
        print()

    # exit()
================================================
FILE: scripts/compute_masks.py
================================================
import numpy as np
import matplotlib.pyplot as plt
import cv2
import torch
import torch.nn.functional as F
# RGBA palette cycled over instances when painting masks below.
COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128),
          (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128))
def mask_iou(mask1, mask2):
    """
    Inputs are matrices of size _ x N (each row one flattened binary mask).
    Output is pairwise IoU of size _1 x _2.
    """
    # All pairwise intersections in one matrix product.
    inter = mask1 @ mask2.t()

    # Row-mask areas as a column vector, column-mask areas as a row vector.
    area1 = mask1.sum(dim=1).view(-1, 1)
    area2 = mask2.sum(dim=1).view(1, -1)

    return inter / (area1 + area2 - inter)
def paint_mask(img_numpy, mask, color):
    """Alpha-blend a binary [h, w] mask onto a copy of the [h, w, 3] float image
    using the RGB part of `color`; the input image is left untouched."""
    h, w, _ = img_numpy.shape
    out = img_numpy.copy()

    # Broadcast the mask to 3 channels and build the solid color layer under it.
    mask3 = np.tile(mask.reshape(h, w, 1), (1, 1, 3))
    color_layer = np.tile(np.array(color[:3]).reshape(1, 1, 3), (h, w, 1)) * mask3

    alpha = 0.3

    # Inside the mask: 70% original pixel + 30% color. Outside: unchanged.
    masked_pixels = out * mask3
    out *= (1 - mask3)
    out += masked_pixels * (1 - alpha) + color_layer * alpha

    return out
# Inverse sigmoid
def logit(x):
    # Small epsilon pads both the denominator and the log argument so that
    # x == 1 and x == 0 stay finite.
    eps = 0.0001
    return np.log(x / (1 - x + eps) + eps)
def sigmoid(x):
    # Standard logistic function.
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)
img_fmt = '../data/coco/images/%012d.jpg'

# info.txt holds the id of the image to visualize (presumably written by the
# mask-debug path in eval.py — verify).
with open('info.txt', 'r') as f:
    img_id = int(f.read())

img = plt.imread(img_fmt % img_id).astype(np.float32)
h, w, _ = img.shape

# gt.npy is transposed to [h, w, num_gt]; proto.npy is loaded as saved.
gt_masks = np.load('gt.npy').astype(np.float32).transpose(1, 2, 0)
proto_masks = np.load('proto.npy').astype(np.float32)

# Upsample the prototype masks to the full image size.
proto_masks = torch.Tensor(proto_masks).permute(2, 0, 1).contiguous().unsqueeze(0)
proto_masks = F.interpolate(proto_masks, (h, w), mode='bilinear', align_corners=False).squeeze(0)
proto_masks = proto_masks.permute(1, 2, 0).numpy()

# # A x = b
ls_A = proto_masks.reshape(-1, proto_masks.shape[-1])
ls_b = gt_masks.reshape(-1, gt_masks.shape[-1])

# Least-squares coefficients: the best linear combination of prototypes per gt mask.
# x is size [256, num_gt]
x = np.linalg.lstsq(ls_A, ls_b, rcond=None)[0]

approximated_masks = (np.matmul(proto_masks, x) > 0.5).astype(np.float32)

num_gt = approximated_masks.shape[2]

# Per-instance IoU between each approximation and its own gt, reported best-first.
ious = mask_iou(torch.Tensor(approximated_masks.reshape(-1, num_gt).T),
                torch.Tensor(gt_masks.reshape(-1, num_gt).T))
ious = [int(ious[i, i].item() * 100) for i in range(num_gt)]
ious.sort(key=lambda x: -x)
print(ious)

# Show the gt masks painted over the image...
gt_img = img.copy()
for i in range(num_gt):
    gt_img = paint_mask(gt_img, gt_masks[:, :, i], COLORS[i % len(COLORS)])

plt.imshow(gt_img / 255)
plt.title('GT')
plt.show()

# ...then the least-squares approximations for comparison.
for i in range(num_gt):
    img = paint_mask(img, approximated_masks[:, :, i], COLORS[i % len(COLORS)])

plt.imshow(img / 255)
plt.title('Approximated')
plt.show()
================================================
FILE: scripts/convert_darknet.py
================================================
from backbone import DarkNetBackbone
import h5py
import torch
# Copy DarkNet-53 weights from a keras h5 checkpoint into the pytorch backbone's
# state dict, relying on both key lists sorting into the same layer order.
f = h5py.File('darknet53.h5', 'r')
m = f['model_weights']

# Keep only groups that actually contain weights.
yolo_keys = list(m.keys())
yolo_keys = [x for x in yolo_keys if len(m[x].keys()) > 0]
yolo_keys.sort()

sd = DarkNetBackbone().state_dict()

sd_keys = list(sd.keys())
sd_keys.sort()

# Note this won't work if there are 10 elements in some list but whatever that doesn't happen
layer_keys = list(set(['.'.join(x.split('.')[:-2]) for x in sd_keys]))
layer_keys.sort()

# print([x for x in sd_keys if x.startswith(layer_keys[0])])

# Map pytorch state-dict key suffixes to the (h5 group pattern, dataset name)
# holding the corresponding tensor; None means nothing to copy.
mapping = {
    '.0.weight' : ('conv2d_%d', 'kernel:0'),
    '.1.bias' : ('batch_normalization_%d', 'beta:0'),
    '.1.weight' : ('batch_normalization_%d', 'gamma:0'),
    '.1.running_var' : ('batch_normalization_%d', 'moving_variance:0'),
    '.1.running_mean': ('batch_normalization_%d', 'moving_mean:0'),
    '.1.num_batches_tracked': None,
}

for i, layer_key in zip(range(1, len(layer_keys) + 1), layer_keys):
    # This is pretty inefficient but I don't care
    for weight_key in [x for x in sd_keys if x.startswith(layer_key)]:
        diff = weight_key[len(layer_key):]

        if mapping[diff] is not None:
            yolo_key = mapping[diff][0] % i
            sub_key = mapping[diff][1]

            # Dataset.value was removed in h5py 3.0; empty-tuple indexing is the
            # supported way to read a whole dataset.
            yolo_weight = torch.Tensor(m[yolo_key][yolo_key][sub_key][()])
            if (len(yolo_weight.size()) == 4):
                # Reorder 4-d conv kernels to match pytorch's expected axis order.
                yolo_weight = yolo_weight.permute(3, 2, 0, 1).contiguous()

            sd[weight_key] = yolo_weight

torch.save(sd, 'weights/darknet53.pth')
================================================
FILE: scripts/convert_sbd.py
================================================
import scipy.io, scipy.ndimage
import os.path, json
import pycocotools.mask
import numpy as np
def mask2bbox(mask):
    """Tight (x, y, w, h) box around a mask's nonzero pixels.
    NOTE(review): w/h are max - min without +1, so a 1-pixel extent yields 0 —
    confirm this matches the expected COCO bbox convention."""
    row_hits = np.where(np.any(mask, axis=1))[0]
    col_hits = np.where(np.any(mask, axis=0))[0]

    rmin, rmax = row_hits[0], row_hits[-1]
    cmin, cmax = col_hits[0], col_hits[-1]

    return cmin, rmin, cmax - cmin, rmax - rmin
# Layout of the unpacked SBD dataset relative to the working directory.
inst_path = './inst/'
img_path = './img/'

img_name_fmt = '%s.jpg'
ann_name_fmt = '%s.mat'

# COCO-style ids are 1-based and kept global across the train/val splits.
image_id = 1
ann_id = 1

types = ['train', 'val']
# Convert each SBD split's .mat instance annotations into a COCO-format json.
for t in types:
    with open('%s.txt' % t, 'r') as f:
        names = f.read().strip().split('\n')

    images = []
    annotations = []

    for name in names:
        img_name = img_name_fmt % name

        # GTinst holds the instance segmentation and per-instance class labels.
        ann_path = os.path.join(inst_path, ann_name_fmt % name)
        ann = scipy.io.loadmat(ann_path)['GTinst'][0][0]

        classes = [int(x[0]) for x in ann[2]]
        seg = ann[0]

        for idx in range(len(classes)):
            # Instances are labeled 1..N in the segmentation map.
            # (np.float was removed in NumPy 1.24; the builtin float is the same dtype.)
            mask = (seg == (idx + 1)).astype(float)
            rle = pycocotools.mask.encode(np.asfortranarray(mask.astype(np.uint8)))
            rle['counts'] = rle['counts'].decode('ascii')

            annotations.append({
                'id': ann_id,
                'image_id': image_id,
                'category_id': classes[idx],
                'segmentation': rle,
                'area': float(mask.sum()),
                'bbox': [int(x) for x in mask2bbox(mask)],
                'iscrowd': 0
            })

            ann_id += 1

        img_name = img_name_fmt % name
        # NOTE(review): scipy.ndimage.imread was removed in SciPy 1.2; on modern
        # scipy this call needs imageio.imread (or PIL) instead — only the image's
        # width/height are used here.
        img = scipy.ndimage.imread(os.path.join(img_path, img_name))

        images.append({
            'id': image_id,
            'width': img.shape[1],
            'height': img.shape[0],
            'file_name': img_name
        })

        image_id += 1

    info = {
        'year': 2012,
        'version': 1,
        'description': 'Pascal SBD',
    }

    # SBD uses the 20 Pascal VOC classes; only the ids are required here.
    categories = [{'id': x+1} for x in range(20)]

    with open('pascal_sbd_%s.json' % t, 'w') as f:
        json.dump({
            'info': info,
            'images': images,
            'annotations': annotations,
            'licenses': {},
            'categories': categories
        }, f)
================================================
FILE: scripts/eval.sh
================================================
#!/bin/bash
#SBATCH -p GPU-small
#SBATCH -t 2:00:00
#SBATCH --gres=gpu:p100:1
#SBATCH --no-requeue

# Usage: ./eval.sh weights extra_args
# Slurm job: run eval.py on one P100 for the given weights file, passing any
# extra args through and logging all output to logs/eval/<weights-basename>.log.

module load python/3.6.4_gcc5_np1.14.5
module load cuda/9.0

cd $SCRATCH/yolact

python3 eval.py --trained_model=$1 --no_bar $2 > logs/eval/$(basename -- $1).log 2>&1
================================================
FILE: scripts/make_grid.py
================================================
import numpy as np
import math, random
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider, Button
fig, ax = plt.subplots()
plt.subplots_adjust(bottom=0.24)  # leave room under the plot for the widgets
im_handle = None

save_path = 'grid.npy'

# Current grid parameters; mutated by the slider callbacks below.
center_x, center_y = (0.5, 0.5)
grid_w, grid_h = (35, 35)
spacing = 0
scale = 4
angle = 0

grid = None       # most recently rendered grid image
all_grids = []    # grids accumulated for export
unique = False

# A hack
disable_render = False
def render():
    """Recompute the tri-sinusoidal grid from the current parameters and redraw.

    Reads the module-level state (center_x, center_y, grid_w, grid_h, spacing,
    scale, angle), rebuilds `grid`, and refreshes the matplotlib image handle.
    Does nothing while `disable_render` is set (used to batch slider updates).
    """
    if disable_render:
        return

    # Pixel coordinates shifted so (center_x, center_y) maps to the origin.
    # NOTE: np.float was deprecated in NumPy 1.20 and removed in 1.24; use
    # the explicit float64 dtype instead.
    x = np.tile(np.array(list(range(grid_w)), dtype=np.float64).reshape(1, grid_w), [grid_h, 1]) - grid_w * center_x
    y = np.tile(np.array(list(range(grid_h)), dtype=np.float64).reshape(grid_h, 1), [1, grid_w]) - grid_h * center_y

    x /= scale
    y /= scale

    # Three line directions offset by 60 degrees form a triangular lattice.
    a1 = angle + math.pi / 3
    a2 = -angle + math.pi / 3
    a3 = angle

    z1 = x * math.sin(a1) + y * math.cos(a1)
    z2 = x * math.sin(a2) - y * math.cos(a2)
    z3 = x * math.sin(a3) + y * math.cos(a3)

    s1 = np.square(np.sin(z1))
    s2 = np.square(np.sin(z2))
    s3 = np.square(np.sin(z3))

    # `spacing` sharpens or widens the lines via an exponential falloff.
    line_1 = np.exp(s1 * spacing) * s1
    line_2 = np.exp(s2 * spacing) * s2
    line_3 = np.exp(s3 * spacing) * s3

    global grid
    grid = np.clip(1 - (line_1 + line_2 + line_3) / 3, 0, 1)

    global im_handle
    if im_handle is None:
        im_handle = plt.imshow(grid)
    else:
        im_handle.set_data(grid)

    fig.canvas.draw_idle()
def update_scale(val):
    """Slider callback: set the lattice scale and redraw."""
    global scale
    scale = val
    render()


def update_angle(val):
    """Slider callback: set the lattice rotation and redraw."""
    global angle
    angle = val
    render()


def update_centerx(val):
    """Slider callback: set the horizontal center and redraw."""
    global center_x
    center_x = val
    render()


def update_centery(val):
    """Slider callback: set the vertical center and redraw."""
    global center_y
    center_y = val
    render()


def update_spacing(val):
    """Slider callback: set the line spacing/sharpness and redraw."""
    global spacing
    spacing = val
    render()
def randomize(val):
    """Button callback: pick random grid parameters and sync the sliders.

    Rendering is suppressed while the sliders are updated (each set_val would
    otherwise trigger its own redraw), then done exactly once at the end.
    """
    global center_x, center_y, spacing, scale, angle, disable_render

    center_x, center_y = (random.uniform(0, 1), random.uniform(0, 1))
    spacing = random.uniform(-0.2, 2)
    scale = 4 * math.exp(random.uniform(-1, 1))
    angle = random.uniform(-math.pi, math.pi)

    disable_render = True
    for slider, value in ((scale_slider, scale),
                          (angle_slider, angle),
                          (centx_slider, center_x),
                          (centy_slider, center_y),
                          (spaci_slider, spacing)):
        slider.set_val(value)
    disable_render = False

    render()
def add(val):
    """Button callback: store the current grid and re-check pixel uniqueness."""
    global unique

    all_grids.append(grid)
    if not unique:
        unique = test_uniqueness(np.stack(all_grids))

    export_len_text.set_text('Num Grids: ' + str(len(all_grids)))
    fig.canvas.draw_idle()
def add_randomize(val):
    """Button callback: save the current grid, then jump to a new random one."""
    add(val)
    randomize(val)
def export(val):
    """Button callback: dump every collected grid to `save_path`, then reset."""
    global unique

    np.save(save_path, np.stack(all_grids))
    print('Saved %d grids to "%s"' % (len(all_grids), save_path))

    unique = False
    all_grids.clear()

    export_len_text.set_text('Num Grids: ' + str(len(all_grids)))
    fig.canvas.draw_idle()
def test_uniqueness(grids):
# Grids shape [ngrids, h, w]
grids = grids.reshape((-1, grid_h, grid_w))
for y in range(grid_h):
for x in range(grid_h):
pixel_features = grids[:, y, x]
# l1 distance for this pixel with every other
l1_dist = np.sum(np.abs(grids - np.tile(pixel_features, grid_h*grid_w).reshape((-1, grid_h, grid_w))), axis=0)
# Equal if l1 distance is really small. Note that this will include this pixel
num_equal = np.sum((l1_dist < 0.0001).astype(np.int32))
if num_equal > 1:
print('Pixel at (%d, %d) has %d other pixel%s with the same representation.' % (x, y, num_equal-1, '' if num_equal==2 else 's'))
return False
print('Each pixel has a distinct representation.')
return True
# Draw the initial grid before wiring up the UI.
render()

# --- Parameter sliders -------------------------------------------------------
axis = plt.axes([0.22, 0.19, 0.59, 0.03], facecolor='lightgoldenrodyellow')
scale_slider = Slider(axis, 'Scale', 0.1, 20, valinit=scale, valstep=0.1)
scale_slider.on_changed(update_scale)
axis = plt.axes([0.22, 0.15, 0.59, 0.03], facecolor='lightgoldenrodyellow')
angle_slider = Slider(axis, 'Angle', -math.pi, math.pi, valinit=angle, valstep=0.1)
angle_slider.on_changed(update_angle)
axis = plt.axes([0.22, 0.11, 0.59, 0.03], facecolor='lightgoldenrodyellow')
centx_slider = Slider(axis, 'Center X', 0, 1, valinit=center_x, valstep=0.05)
centx_slider.on_changed(update_centerx)
axis = plt.axes([0.22, 0.07, 0.59, 0.03], facecolor='lightgoldenrodyellow')
centy_slider = Slider(axis, 'Center Y', 0, 1, valinit=center_y, valstep=0.05)
centy_slider.on_changed(update_centery)
axis = plt.axes([0.22, 0.03, 0.59, 0.03], facecolor='lightgoldenrodyellow')
spaci_slider = Slider(axis, 'Spacing', -1, 2, valinit=spacing, valstep=0.05)
spaci_slider.on_changed(update_spacing)

# --- Action buttons ----------------------------------------------------------
axis = plt.axes([0.8, 0.54, 0.15, 0.05], facecolor='lightgoldenrodyellow')
rando_button = Button(axis, 'Randomize')
rando_button.on_clicked(randomize)
axis = plt.axes([0.8, 0.48, 0.15, 0.05], facecolor='lightgoldenrodyellow')
addgr_button = Button(axis, 'Add')
addgr_button.on_clicked(add)
# Likely not a good way to do this but whatever
export_len_text = plt.text(0, 3, 'Num Grids: 0')
axis = plt.axes([0.8, 0.42, 0.15, 0.05], facecolor='lightgoldenrodyellow')
addra_button = Button(axis, 'Add / Rand')
addra_button.on_clicked(add_randomize)
axis = plt.axes([0.8, 0.36, 0.15, 0.05], facecolor='lightgoldenrodyellow')
saveg_button = Button(axis, 'Save')
saveg_button.on_clicked(export)

plt.show()
================================================
FILE: scripts/optimize_bboxes.py
================================================
"""
Instead of clustering bbox widths and heights, this script
directly optimizes average IoU across the training set given
the specified number of anchor boxes.
Run this script from the Yolact root directory.
"""
import pickle
import random
from itertools import product
from math import sqrt
import numpy as np
import torch
from scipy.optimize import minimize
dump_file = 'weights/bboxes.pkl'
aug_file = 'weights/bboxes_aug.pkl'
use_augmented_boxes = True
def intersect(box_a, box_b):
    """Pairwise intersection areas between two sets of point-form boxes.

    Broadcasts box_a [A,4] against box_b [B,4] using expanded views (no new
    allocation for the broadcast itself).

    Args:
        box_a: (tensor) bounding boxes, Shape: [A,4].
        box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
        (tensor) intersection area, Shape: [A,B].
    """
    n_a, n_b = box_a.size(0), box_b.size(0)

    # Bottom-right corner of each overlap: elementwise min of the max corners.
    lower_right = torch.min(box_a[:, 2:].unsqueeze(1).expand(n_a, n_b, 2),
                            box_b[:, 2:].unsqueeze(0).expand(n_a, n_b, 2))
    # Top-left corner of each overlap: elementwise max of the min corners.
    upper_left = torch.max(box_a[:, :2].unsqueeze(1).expand(n_a, n_b, 2),
                           box_b[:, :2].unsqueeze(0).expand(n_a, n_b, 2))

    wh = (lower_right - upper_left).clamp(min=0)
    return wh[:, :, 0] * wh[:, :, 1]
def jaccard(box_a, box_b, iscrowd=False):
    """Pairwise jaccard overlap (IoU) between two sets of point-form boxes.

    A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)

    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
        iscrowd: when True, box_b holds crowd regions and the overlap is
            normalized by area(A) alone instead of the union.
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    inter = intersect(box_a, box_b)

    widths_a = box_a[:, 2] - box_a[:, 0]
    heights_a = box_a[:, 3] - box_a[:, 1]
    area_a = (widths_a * heights_a).unsqueeze(1).expand_as(inter)  # [A,B]

    widths_b = box_b[:, 2] - box_b[:, 0]
    heights_b = box_b[:, 3] - box_b[:, 1]
    area_b = (widths_b * heights_b).unsqueeze(0).expand_as(inter)  # [A,B]

    if iscrowd:
        return inter / area_a
    return inter / (area_a + area_b - inter)  # [A,B]
# Also convert to point form
def to_relative(bboxes):
    """Convert rows of [img_w, img_h, x, y, w, h] to relative point-form [x1, y1, x2, y2]."""
    img_size = bboxes[:, :2]
    top_left = bboxes[:, 2:4] / img_size
    bottom_right = (bboxes[:, 2:4] + bboxes[:, 4:]) / img_size
    return np.concatenate((top_left, bottom_right), axis=1)
def make_priors(conv_size, scales, aspect_ratios):
    """Build point-form anchor boxes for one feature layer as a [N,4] CUDA tensor."""
    conv_h, conv_w = conv_size
    prior_data = []

    # Iteration order is important (it has to sync up with the convout)
    for j, i in product(range(conv_h), range(conv_w)):
        # Anchor center in relative image coordinates.
        x = (i + 0.5) / conv_w
        y = (j + 0.5) / conv_h

        for scale, ars in zip(scales, aspect_ratios):
            for ar in ars:
                w = scale * ar / conv_w
                h = scale / ar / conv_h
                # Point form
                prior_data.extend([x - w / 2, y - h / 2, x + w / 2, y + h / 2])

    return torch.Tensor(prior_data).view(-1, 4).cuda()
scales = [[1.68, 2.91], [2.95, 2.22, 0.84], [2.17, 2.22, 3.22], [0.76, 2.06, 2.81], [5.33, 2.79], [13.69]]
aspect_ratios = [[[0.72, 0.96], [0.68, 1.17]], [[1.30, 0.66], [0.63, 1.23], [0.87, 1.41]], [[1.96, 1.23], [0.58, 0.84], [0.61, 1.15]], [[19.79, 2.21], [0.47, 1.76], [1.38, 0.79]], [[4.79, 17.96], [1.04]], [[14.82]]]
conv_sizes = [(35, 35), (18, 18), (9, 9), (5, 5), (3, 3), (2, 2)]
optimize_scales = False
batch_idx = 0
def compute_hits(bboxes, anchors, iou_threshold=0.5):
    """Bool tensor marking each gt box whose best-matching anchor IoU exceeds the threshold."""
    best_anchor_iou = jaccard(bboxes, anchors).max(dim=1)[0]
    return best_anchor_iou > iou_threshold
def compute_recall(hits, base_hits):
    """Fraction of gt boxes covered by either this layer's hits or the baseline hits."""
    combined = (hits | base_hits).float()
    return combined.sum() / combined.size(0)
def step(x, x_func, bboxes, base_hits, optim_idx):
    """scipy.optimize objective: negated recall after applying parameter vector x."""
    # This should set the scale and aspect ratio
    x_func(x, scales[optim_idx], aspect_ratios[optim_idx])

    anchors = make_priors(conv_sizes[optim_idx], scales[optim_idx], aspect_ratios[optim_idx])
    recall = compute_recall(compute_hits(bboxes, anchors), base_hits)
    return -float(recall.cpu())
def optimize(full_bboxes, optim_idx, batch_size=5000):
    """Optimize one feature layer's scales or aspect ratios (Powell) on a gt-box batch.

    Anchors from every other layer are held fixed; their hits form the baseline
    so only the marginal recall of layer `optim_idx` is optimized.
    """
    global batch_idx, scales, aspect_ratios, conv_sizes

    start = batch_idx * batch_size
    end = min((batch_idx + 1) * batch_size, full_bboxes.size(0))

    if batch_idx > (full_bboxes.size(0) // batch_size):
        batch_idx = 0

    bboxes = full_bboxes[start:end, :]

    # Anchors for every layer except the one being optimized.
    anchor_base = [
        make_priors(conv_sizes[idx], scales[idx], aspect_ratios[idx])
        for idx in range(len(conv_sizes)) if idx != optim_idx]
    base_hits = compute_hits(bboxes, torch.cat(anchor_base, dim=0))

    def set_x(x, scales, aspect_ratios):
        # Write the flat parameter vector back into the layer's config lists.
        if optimize_scales:
            for i in range(len(scales)):
                scales[i] = max(x[i], 0)
        else:
            k = 0
            for i in range(len(aspect_ratios)):
                for j in range(len(aspect_ratios[i])):
                    aspect_ratios[i][j] = x[k]
                    k += 1

    res = minimize(step, x0=scales[optim_idx] if optimize_scales else sum(aspect_ratios[optim_idx], []), method='Powell',
                   args=(set_x, bboxes, base_hits, optim_idx),)
def pretty_str(x:list):
    """Recursively format a (possibly nested) list or ndarray with 2-decimal floats."""
    if isinstance(x, np.ndarray):
        return pretty_str(list(x))
    if isinstance(x, list):
        return '[' + ', '.join(pretty_str(item) for item in x) + ']'
    return '%.2f' % x
if __name__ == '__main__':
    # Load ground-truth boxes as [img_w, img_h, x, y, w, h] rows, either
    # augmentation-expanded or raw (see scripts/save_bboxes.py).
    if use_augmented_boxes:
        with open(aug_file, 'rb') as f:
            bboxes = pickle.load(f)
    else:
        # Load widths and heights from a dump file. Obtain this with
        # python3 scripts/save_bboxes.py
        with open(dump_file, 'rb') as f:
            bboxes = pickle.load(f)

    bboxes = np.array(bboxes)
    bboxes = to_relative(bboxes)

    with torch.no_grad():
        bboxes = torch.Tensor(bboxes).cuda()

        def print_out():
            # Report whichever parameter set is currently being optimized.
            if optimize_scales:
                print('Scales: ' + pretty_str(scales))
            else:
                print('Aspect Ratios: ' + pretty_str(aspect_ratios))

        # Alternate between optimizing scales and aspect ratios, sweeping the
        # feature layers one at a time each iteration.
        for p in range(10):
            print('(Sub Iteration) ', end='')
            for i in range(len(conv_sizes)):
                print('%d ' % i, end='', flush=True)
                optimize(bboxes, i)
            print('Done', end='\r')

            print('(Iteration %d) ' % p, end='')
            print_out()
            print()

            optimize_scales = not optimize_scales

        print('scales = ' + pretty_str(scales))
        print('aspect_ratios = ' + pretty_str(aspect_ratios))
================================================
FILE: scripts/parse_eval.py
================================================
import re, sys, os
import matplotlib.pyplot as plt
from matplotlib._color_data import XKCD_COLORS
with open(sys.argv[1], 'r') as f:
    txt = f.read()

# Everything before 'overall performance' is the per-category breakdown.
txt, overall = txt.split('overall performance')

def grabMAP(string):
    """Pull the numeric AP out of a COCOeval summary line and scale to percent."""
    return float(string.split('] = ')[1]) * 100

class_names = []
mAP_overall = []
mAP_small = []
mAP_medium = []
mAP_large = []

for class_result in txt.split('evaluate category: ')[1:]:
    lines = class_result.split('\n')
    class_names.append(lines[0])
    # Fixed line offsets within each category's COCOeval summary printout.
    mAP_overall.append(grabMAP(lines[ 7]))
    mAP_small  .append(grabMAP(lines[10]))
    mAP_medium .append(grabMAP(lines[11]))
    mAP_large  .append(grabMAP(lines[12]))

mAP_map = {
    'small': mAP_small,
    'medium': mAP_medium,
    'large': mAP_large,
}

# Optional second argument selects a size bucket; default is the overall AP.
if len(sys.argv) > 2:
    bars = plt.bar(class_names, mAP_map[sys.argv[2]])
    plt.title(sys.argv[2] + ' mAP per class')
else:
    bars = plt.bar(class_names, mAP_overall)
    plt.title('overall mAP per class')

colors = list(XKCD_COLORS.values())
for idx, bar in enumerate(bars):
    # Mmm pseudorandom colors
    char_sum = sum([ord(char) for char in class_names[idx]])
    bar.set_color(colors[char_sum % len(colors)])

plt.xticks(rotation='vertical')
plt.show()
================================================
FILE: scripts/plot_loss.py
================================================
import re, sys, os
import matplotlib.pyplot as plt
from utils.functions import MovingAverage
with open(sys.argv[1], 'r') as f:
    inp = f.read()

# The named groups below were reconstructed: the original patterns had been
# corrupted to '(?P\d+)' (the '<name>' part of every named group stripped),
# which is not a valid regex. The names match the keys consumed downstream
# by plot_train (iteration, b, c, m, s) and plot_val (type, all, epoch).
patterns = {
    'train': re.compile(r'\[\s*(?P<epoch>\d+)\]\s*(?P<iteration>\d+) \|\| B: (?P<b>\S+) \| C: (?P<c>\S+) \| M: (?P<m>\S+) \|( S: (?P<s>\S+) \|)? T: (?P<t>\S+)'),
    'val': re.compile(r'\s*(?P<type>[a-z]+) \|\s*(?P<all>\S+)')
}
data = {key: [] for key in patterns}

for line in inp.split('\n'):
    for key, pattern in patterns.items():
        f = pattern.search(line)
        if f is not None:
            datum = f.groupdict()
            # Convert numeric fields; leave non-numeric strings (e.g. 'box') alone.
            for k, v in datum.items():
                if v is not None:
                    try:
                        v = float(v)
                    except ValueError:
                        pass
                datum[k] = v

            # Pair each validation entry with the most recent training entry
            # so plot_val can recover the epoch it belongs to.
            if key == 'val':
                datum = (datum, data['train'][-1])
            data[key].append(datum)
            break
def smoother(y, interval=100):
    """Smooth a sequence in place with a windowed moving average; returns the same list."""
    avg = MovingAverage(interval)

    for idx, value in enumerate(y):
        avg.append(value)
        y[idx] = avg.get_avg()

    return y
def plot_train(data):
    """Plot smoothed per-iteration training losses parsed from the log."""
    plt.title(os.path.basename(sys.argv[1]) + ' Training Loss')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')

    loss_names = ['BBox Loss', 'Conf Loss', 'Mask Loss']

    x = [entry['iteration'] for entry in data]
    for loss_key in ('b', 'c', 'm'):
        plt.plot(x, smoother([entry[loss_key] for entry in data]))

    # The segmentation-loss column only appears for configs that enable it.
    if data[0]['s'] is not None:
        plt.plot(x, smoother([entry['s'] for entry in data]))
        loss_names.append('Segmentation Loss')

    plt.legend(loss_names)
    plt.show()
def plot_val(data):
    """Plot per-epoch validation box and mask mAP parsed from the log."""
    plt.title(os.path.basename(sys.argv[1]) + ' Validation mAP')
    plt.xlabel('Epoch')
    plt.ylabel('mAP')

    # Each entry is (val_dict, last_train_dict); the epoch comes from the train side.
    x = [entry[1]['epoch'] for entry in data if entry[0]['type'] == 'box']
    plt.plot(x, [entry[0]['all'] for entry in data if entry[0]['type'] == 'box'])
    plt.plot(x, [entry[0]['all'] for entry in data if entry[0]['type'] == 'mask'])

    plt.legend(['BBox mAP', 'Mask mAP'])
    plt.show()
# Second CLI argument 'val' selects the validation plot; default is training loss.
if len(sys.argv) > 2 and sys.argv[2] == 'val':
    plot_val(data['val'])
else:
    plot_train(data['train'])
================================================
FILE: scripts/resume.sh
================================================
#!/bin/bash
# SLURM batch script: resume YOLACT training from a checkpoint on a shared P100 GPU.
#SBATCH -p GPU-shared
#SBATCH -t 48:00:00
#SBATCH --gres=gpu:p100:1
#SBATCH --no-requeue
# Usage: ./resume.sh config batch_size resume_file
# Load the cluster-provided Python/CUDA toolchain.
module load python/3.6.4_gcc5_np1.14.5
module load cuda/9.0
cd $SCRATCH/yolact
# --start_iter=-1 makes train.py read the iteration from the checkpoint's
# filename; output is appended to the per-config log file.
python3 train.py --config $1 --batch_size $2 --resume=$3 --save_interval 5000 --start_iter=-1 >>logs/$1_log 2>&1
================================================
FILE: scripts/save_bboxes.py
================================================
""" This script transforms and saves bbox coordinates into a pickle object for easy loading. """
import os.path as osp
import json, pickle
import sys
import numpy as np
COCO_ROOT = osp.join('.', 'data/coco/')
annotation_file = 'instances_train2017.json'
annotation_path = osp.join(COCO_ROOT, 'annotations/', annotation_file)
dump_file = 'weights/bboxes.pkl'
with open(annotation_path, 'r') as f:
annotations_json = json.load(f)
annotations = annotations_json['annotations']
images = annotations_json['images']
images = {image['id']: image for image in images}
bboxes = []
for ann in annotations:
image = images[ann['image_id']]
w,h = (image['width'], image['height'])
if 'bbox' in ann:
bboxes.append([w, h] + ann['bbox'])
with open(dump_file, 'wb') as f:
pickle.dump(bboxes, f)
================================================
FILE: scripts/train.sh
================================================
#!/bin/bash
# SLURM batch script: start YOLACT training from scratch on a shared P100 GPU.
#SBATCH -p GPU-shared
#SBATCH -t 48:00:00
#SBATCH --gres=gpu:p100:1
#SBATCH --no-requeue
# Usage: ./train.sh config batch_size
# Load the cluster-provided Python/CUDA toolchain.
module load python/3.6.4_gcc5_np1.14.5
module load cuda/9.0
cd $SCRATCH/yolact
# &> truncates and captures both stdout and stderr into the per-config log.
python3 train.py --config $1 --batch_size $2 --save_interval 5000 &>logs/$1_log
================================================
FILE: scripts/unpack_statedict.py
================================================
import torch
import sys, os

# Usage: python scripts/unpack_statedict.py path_to_pth out_folder
# Splits a saved state dict into one file per entry so individual tensors can
# be inspected or loaded on their own. The output folder no longer needs a
# trailing slash (paths are joined with os.path.join).

print('Loading state dict...')
# NOTE(review): torch.load unpickles arbitrary objects -- only run this on
# checkpoint files from a trusted source.
state = torch.load(sys.argv[1])

# Race-free directory creation (replaces the exists()+mkdir() pair).
os.makedirs(sys.argv[2], exist_ok=True)

print('Saving stuff...')
for key, val in state.items():
    torch.save(val, os.path.join(sys.argv[2], key))
================================================
FILE: train.py
================================================
from data import *
from utils.augmentations import SSDAugmentation, BaseTransform
from utils.functions import MovingAverage, SavePath
from utils.logger import Log
from utils import timer
from layers.modules import MultiBoxLoss
from yolact import Yolact
import os
import sys
import time
import math, random
from pathlib import Path
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.init as init
import torch.utils.data as data
import numpy as np
import argparse
import datetime
# Oof
import eval as eval_script
def str2bool(v):
    """Interpret common truthy strings ('yes', 'true', 't', '1') as True, case-insensitively."""
    truthy = ("yes", "true", "t", "1")
    return v.lower() in truthy
# Command-line interface for the training script. Options left at None are
# filled in from the active config afterwards (see replace() below).
parser = argparse.ArgumentParser(
    description='Yolact Training Script')
parser.add_argument('--batch_size', default=8, type=int,
                    help='Batch size for training')
parser.add_argument('--resume', default=None, type=str,
                    help='Checkpoint state_dict file to resume training from. If this is "interrupt"'\
                         ', the model will resume training from the interrupt file.')
parser.add_argument('--start_iter', default=-1, type=int,
                    help='Resume training at this iter. If this is -1, the iteration will be'\
                         'determined from the file name.')
parser.add_argument('--num_workers', default=4, type=int,
                    help='Number of workers used in dataloading')
parser.add_argument('--cuda', default=True, type=str2bool,
                    help='Use CUDA to train model')
parser.add_argument('--lr', '--learning_rate', default=None, type=float,
                    help='Initial learning rate. Leave as None to read this from the config.')
parser.add_argument('--momentum', default=None, type=float,
                    help='Momentum for SGD. Leave as None to read this from the config.')
parser.add_argument('--decay', '--weight_decay', default=None, type=float,
                    help='Weight decay for SGD. Leave as None to read this from the config.')
parser.add_argument('--gamma', default=None, type=float,
                    help='For each lr step, what to multiply the lr by. Leave as None to read this from the config.')
parser.add_argument('--save_folder', default='weights/',
                    help='Directory for saving checkpoint models.')
parser.add_argument('--log_folder', default='logs/',
                    help='Directory for saving logs.')
parser.add_argument('--config', default=None,
                    help='The config object to use.')
parser.add_argument('--save_interval', default=10000, type=int,
                    help='The number of iterations between saving the model.')
parser.add_argument('--validation_size', default=5000, type=int,
                    help='The number of images to use for validation.')
parser.add_argument('--validation_epoch', default=2, type=int,
                    help='Output validation information every n iterations. If -1, do no validation.')
parser.add_argument('--keep_latest', dest='keep_latest', action='store_true',
                    help='Only keep the latest checkpoint instead of each one.')
parser.add_argument('--keep_latest_interval', default=100000, type=int,
                    help='When --keep_latest is on, don\'t delete the latest file at these intervals. This should be a multiple of save_interval or 0.')
parser.add_argument('--dataset', default=None, type=str,
                    help='If specified, override the dataset specified in the config with this one (example: coco2017_dataset).')
parser.add_argument('--no_log', dest='log', action='store_false',
                    help='Don\'t log per iteration information into log_folder.')
parser.add_argument('--log_gpu', dest='log_gpu', action='store_true',
                    help='Include GPU information in the logs. Nvidia-smi tends to be slow, so set this with caution.')
parser.add_argument('--no_interrupt', dest='interrupt', action='store_false',
                    help='Don\'t save an interrupt when KeyboardInterrupt is caught.')
parser.add_argument('--batch_alloc', default=None, type=str,
                    help='If using multiple GPUS, you can set this to be a comma separated list detailing which GPUs should get what local batch size (It should add up to your total batch size).')
parser.add_argument('--no_autoscale', dest='autoscale', action='store_false',
                    help='YOLACT will automatically scale the lr and the number of iterations depending on the batch size. Set this if you want to disable that.')

# Defaults for the flag-style options declared above.
parser.set_defaults(keep_latest=False, log=True, log_gpu=False, interrupt=True, autoscale=True)
args = parser.parse_args()
# Apply the requested config / dataset overrides before anything reads cfg.
if args.config is not None:
    set_cfg(args.config)

if args.dataset is not None:
    set_dataset(args.dataset)

# Scale lr and iteration counts relative to the reference batch size of 8.
if args.autoscale and args.batch_size != 8:
    factor = args.batch_size / 8
    if __name__ == '__main__':
        print('Scaling parameters by %.2f to account for a batch size of %d.' % (factor, args.batch_size))

    cfg.lr *= factor
    cfg.max_iter //= factor
    cfg.lr_steps = [x // factor for x in cfg.lr_steps]

# Update training parameters from the config if necessary
def replace(name):
    if getattr(args, name) == None: setattr(args, name, getattr(cfg, name))
replace('lr')
replace('decay')
replace('gamma')
replace('momentum')

# This is managed by set_lr
cur_lr = args.lr

if torch.cuda.device_count() == 0:
    print('No GPUs detected. Exiting...')
    exit(-1)

if args.batch_size // torch.cuda.device_count() < 6:
    if __name__ == '__main__':
        print('Per-GPU batch size is less than the recommended limit for batch norm. Disabling batch norm.')
    cfg.freeze_bn = True

# Order in which loss components are printed each iteration.
loss_types = ['B', 'C', 'M', 'P', 'D', 'E', 'S', 'I']

if torch.cuda.is_available():
    if args.cuda:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
    if not args.cuda:
        print("WARNING: It looks like you have a CUDA device, but aren't " +
              "using CUDA.\nRun with --cuda for optimal training speed.")
        torch.set_default_tensor_type('torch.FloatTensor')
else:
    torch.set_default_tensor_type('torch.FloatTensor')
class NetLoss(nn.Module):
    """
    Runs the network and computes the loss in a single forward pass so that
    DataParallel can parallelize both across GPUs instead of gathering the
    predictions onto one device first.
    """

    def __init__(self, net:Yolact, criterion:MultiBoxLoss):
        super().__init__()

        self.net = net
        self.criterion = criterion

    def forward(self, images, targets, masks, num_crowds):
        predictions = self.net(images)
        return self.criterion(self.net, predictions, targets, masks, num_crowds)
class CustomDataParallel(nn.DataParallel):
    """
    A DataParallel variant tailored to our training data: sub-batches are
    prepared directly on their target devices, so no tensor scatter is needed.
    """

    def scatter(self, inputs, kwargs, device_ids):
        # prepare_data() already places each sub-batch on its device; this just
        # regroups the splits per device instead of shuffling tensors around.
        devices = ['cuda:' + str(x) for x in device_ids]
        splits = prepare_data(inputs[0], devices, allocation=args.batch_alloc)

        per_device = [[split[device_idx] for split in splits]
                      for device_idx in range(len(devices))]
        return per_device, [kwargs] * len(devices)

    def gather(self, outputs, output_device):
        # Stack each named loss from every replica onto the output device.
        out = {}
        for k in outputs[0]:
            out[k] = torch.stack([output[k].to(output_device) for output in outputs])
        return out
def train():
    """Main training loop: builds the datasets, model, optimizer and loss, then
    trains for cfg.max_iter iterations with periodic checkpointing and
    per-epoch validation. Ctrl+C saves an interrupt checkpoint.

    Fix vs. the original: `for avg in loss_avgs: avg.reset()` iterated the
    dict's *keys* (strings) and would raise AttributeError the first time a
    delayed config setting fired; it now iterates .values().
    """
    if not os.path.exists(args.save_folder):
        os.mkdir(args.save_folder)

    dataset = COCODetection(image_path=cfg.dataset.train_images,
                            info_file=cfg.dataset.train_info,
                            transform=SSDAugmentation(MEANS))

    if args.validation_epoch > 0:
        setup_eval()
        val_dataset = COCODetection(image_path=cfg.dataset.valid_images,
                                    info_file=cfg.dataset.valid_info,
                                    transform=BaseTransform(MEANS))

    # Parallel wraps the underlying module, but when saving and loading we don't want that
    yolact_net = Yolact()
    net = yolact_net
    net.train()

    if args.log:
        log = Log(cfg.name, args.log_folder, dict(args._get_kwargs()),
                  overwrite=(args.resume is None), log_gpu_stats=args.log_gpu)

    # I don't use the timer during training (I use a different timing method).
    # Apparently there's a race condition with multiple GPUs, so disable it just to be safe.
    timer.disable_all()

    # Both of these can set args.resume to None, so do them before the check
    if args.resume == 'interrupt':
        args.resume = SavePath.get_interrupt(args.save_folder)
    elif args.resume == 'latest':
        args.resume = SavePath.get_latest(args.save_folder, cfg.name)

    if args.resume is not None:
        print('Resuming training, loading {}...'.format(args.resume))
        yolact_net.load_weights(args.resume)

        if args.start_iter == -1:
            args.start_iter = SavePath.from_str(args.resume).iteration
    else:
        print('Initializing weights...')
        yolact_net.init_weights(backbone_path=args.save_folder + cfg.backbone.path)

    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.decay)
    criterion = MultiBoxLoss(num_classes=cfg.num_classes,
                             pos_threshold=cfg.positive_iou_threshold,
                             neg_threshold=cfg.negative_iou_threshold,
                             negpos_ratio=cfg.ohem_negpos_ratio)

    if args.batch_alloc is not None:
        args.batch_alloc = [int(x) for x in args.batch_alloc.split(',')]
        if sum(args.batch_alloc) != args.batch_size:
            print('Error: Batch allocation (%s) does not sum to batch size (%s).' % (args.batch_alloc, args.batch_size))
            exit(-1)

    net = CustomDataParallel(NetLoss(net, criterion))
    if args.cuda:
        net = net.cuda()

    # Initialize everything
    if not cfg.freeze_bn: yolact_net.freeze_bn() # Freeze bn so we don't kill our means
    yolact_net(torch.zeros(1, 3, cfg.max_size, cfg.max_size).cuda())
    if not cfg.freeze_bn: yolact_net.freeze_bn(True)

    iteration = max(args.start_iter, 0)
    last_time = time.time()

    epoch_size = len(dataset) // args.batch_size
    num_epochs = math.ceil(cfg.max_iter / epoch_size)

    # Which learning rate adjustment step are we on? lr' = lr * gamma ^ step_index
    step_index = 0

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate,
                                  pin_memory=True)

    save_path = lambda epoch, iteration: SavePath(cfg.name, epoch, iteration).get_path(root=args.save_folder)
    time_avg = MovingAverage()

    global loss_types # Forms the print order
    loss_avgs = { k: MovingAverage(100) for k in loss_types }

    print('Begin training!')
    print()
    # try-except so you can use ctrl+c to save early and stop training
    try:
        for epoch in range(num_epochs):
            # Resume from start_iter
            if (epoch+1)*epoch_size < iteration:
                continue

            for datum in data_loader:
                # Stop if we've reached an epoch if we're resuming from start_iter
                if iteration == (epoch+1)*epoch_size:
                    break

                # Stop at the configured number of iterations even if mid-epoch
                if iteration == cfg.max_iter:
                    break

                # Change a config setting if we've reached the specified iteration
                changed = False
                for change in cfg.delayed_settings:
                    if iteration >= change[0]:
                        changed = True
                        cfg.replace(change[1])

                        # Reset the loss averages because things might have changed
                        # (iterate .values(): iterating the dict itself yields keys)
                        for avg in loss_avgs.values():
                            avg.reset()

                # If a config setting was changed, remove it from the list so we don't keep checking
                if changed:
                    cfg.delayed_settings = [x for x in cfg.delayed_settings if x[0] > iteration]

                # Warm up by linearly interpolating the learning rate from some smaller value
                if cfg.lr_warmup_until > 0 and iteration <= cfg.lr_warmup_until:
                    set_lr(optimizer, (args.lr - cfg.lr_warmup_init) * (iteration / cfg.lr_warmup_until) + cfg.lr_warmup_init)

                # Adjust the learning rate at the given iterations, but also if we resume from past that iteration
                while step_index < len(cfg.lr_steps) and iteration >= cfg.lr_steps[step_index]:
                    step_index += 1
                    set_lr(optimizer, args.lr * (args.gamma ** step_index))

                # Zero the grad to get ready to compute gradients
                optimizer.zero_grad()

                # Forward Pass + Compute loss at the same time (see CustomDataParallel and NetLoss)
                losses = net(datum)

                losses = { k: (v).mean() for k,v in losses.items() } # Mean here because Dataparallel
                loss = sum([losses[k] for k in losses])
                # no_inf_mean removes some components from the loss, so make sure to backward through all of it
                # all_loss = sum([v.mean() for v in losses.values()])

                # Backprop
                loss.backward() # Do this to free up vram even if loss is not finite
                if torch.isfinite(loss).item():
                    optimizer.step()

                # Add the loss to the moving average for bookkeeping
                for k in losses:
                    loss_avgs[k].add(losses[k].item())

                cur_time = time.time()
                elapsed = cur_time - last_time
                last_time = cur_time

                # Exclude graph setup from the timing information
                if iteration != args.start_iter:
                    time_avg.add(elapsed)

                if iteration % 10 == 0:
                    eta_str = str(datetime.timedelta(seconds=(cfg.max_iter-iteration) * time_avg.get_avg())).split('.')[0]

                    total = sum([loss_avgs[k].get_avg() for k in losses])
                    loss_labels = sum([[k, loss_avgs[k].get_avg()] for k in loss_types if k in losses], [])

                    print(('[%3d] %7d ||' + (' %s: %.3f |' * len(losses)) + ' T: %.3f || ETA: %s || timer: %.3f')
                            % tuple([epoch, iteration] + loss_labels + [total, eta_str, elapsed]), flush=True)

                if args.log:
                    precision = 5
                    loss_info = {k: round(losses[k].item(), precision) for k in losses}
                    loss_info['T'] = round(loss.item(), precision)

                    if args.log_gpu:
                        log.log_gpu_stats = (iteration % 10 == 0) # nvidia-smi is sloooow

                    log.log('train', loss=loss_info, epoch=epoch, iter=iteration,
                            lr=round(cur_lr, 10), elapsed=elapsed)

                    log.log_gpu_stats = args.log_gpu

                iteration += 1

                if iteration % args.save_interval == 0 and iteration != args.start_iter:
                    if args.keep_latest:
                        latest = SavePath.get_latest(args.save_folder, cfg.name)

                    print('Saving state, iter:', iteration)
                    yolact_net.save_weights(save_path(epoch, iteration))

                    if args.keep_latest and latest is not None:
                        if args.keep_latest_interval <= 0 or iteration % args.keep_latest_interval != args.save_interval:
                            print('Deleting old save...')
                            os.remove(latest)

            # This is done per epoch
            if args.validation_epoch > 0:
                if epoch % args.validation_epoch == 0 and epoch > 0:
                    compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None)

        # Compute validation mAP after training is finished
        compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None)
    except KeyboardInterrupt:
        if args.interrupt:
            print('Stopping early. Saving network...')

            # Delete previous copy of the interrupted network so we don't spam the weights folder
            SavePath.remove_interrupt(args.save_folder)
            yolact_net.save_weights(save_path(epoch, repr(iteration) + '_interrupt'))
        exit()

    yolact_net.save_weights(save_path(epoch, iteration))
def set_lr(optimizer, new_lr):
    """Apply new_lr to every param group and record it in the module-level cur_lr."""
    global cur_lr

    for group in optimizer.param_groups:
        group['lr'] = new_lr

    cur_lr = new_lr
def gradinator(x):
    """Mark a tensor as not requiring grad and return it (inputs never need gradients)."""
    x.requires_grad = False
    return x
def prepare_data(datum, devices:list=None, allocation:list=None):
    """Move a collated batch onto its target device(s) and split it per device.

    Returns (split_images, split_targets, split_masks, split_numcrowds), one
    entry per device, with images stacked into a single tensor per device.
    """
    with torch.no_grad():
        if devices is None:
            devices = ['cuda:0'] if args.cuda else ['cpu']
        if allocation is None:
            # Even split, with the last device absorbing the remainder.
            allocation = [args.batch_size // len(devices)] * (len(devices) - 1)
            allocation.append(args.batch_size - sum(allocation))

        images, (targets, masks, num_crowds) = datum

        # Move each sample to its device and detach it from autograd.
        cur_idx = 0
        for device, alloc in zip(devices, allocation):
            for _ in range(alloc):
                images[cur_idx] = gradinator(images[cur_idx].to(device))
                targets[cur_idx] = gradinator(targets[cur_idx].to(device))
                masks[cur_idx] = gradinator(masks[cur_idx].to(device))
                cur_idx += 1

        if cfg.preserve_aspect_ratio:
            # Choose a random size from the batch
            _, h, w = images[random.randint(0, len(images)-1)].size()

            for idx, (image, target, mask, num_crowd) in enumerate(zip(images, targets, masks, num_crowds)):
                images[idx], targets[idx], masks[idx], num_crowds[idx] \
                    = enforce_size(image, target, mask, num_crowd, w, h)

        # Regroup the per-sample lists into per-device slices.
        cur_idx = 0
        split_images, split_targets, split_masks, split_numcrowds \
            = [[None for alloc in allocation] for _ in range(4)]

        for device_idx, alloc in enumerate(allocation):
            split_images[device_idx] = torch.stack(images[cur_idx:cur_idx+alloc], dim=0)
            split_targets[device_idx] = targets[cur_idx:cur_idx+alloc]
            split_masks[device_idx] = masks[cur_idx:cur_idx+alloc]
            split_numcrowds[device_idx] = num_crowds[cur_idx:cur_idx+alloc]

            cur_idx += alloc

        return split_images, split_targets, split_masks, split_numcrowds
def no_inf_mean(x:torch.Tensor):
    """
    Mean of a vector ignoring non-finite entries.
    Falls back to the plain mean (i.e., inf) when every entry is non-finite.
    """
    finite = [v for v in x if torch.isfinite(v)]

    if len(finite) == 0:
        return x.mean()
    return sum(finite) / len(finite)
def compute_validation_loss(net, data_loader, criterion):
    """
    Runs the criterion over at most args.validation_size images from
    data_loader and prints per-loss-type averages.

    The network deliberately stays in train mode: eval mode changes the
    outputs so the criterion could no longer compute losses.
    """
    global loss_types

    with torch.no_grad():
        losses = {}

        # Don't switch to eval mode because we want to get losses
        iterations = 0
        for datum in data_loader:
            images, targets, masks, num_crowds = prepare_data(datum)
            out = net(images)

            wrapper = ScatterWrapper(targets, masks, num_crowds)
            _losses = criterion(out, wrapper, wrapper.make_mask())

            # Accumulate each loss type as a plain float (mean across devices).
            for k, v in _losses.items():
                v = v.mean().item()
                if k in losses:
                    losses[k] += v
                else:
                    losses[k] = v

            iterations += 1
            if args.validation_size <= iterations * args.batch_size:
                break

        for k in losses:
            losses[k] /= iterations

        # Only report losses with a known label. Size the format string by the
        # number of (label, value) pairs actually collected: sizing it by
        # len(losses), as before, raises a TypeError whenever the criterion
        # returns a loss key that is not listed in loss_types.
        loss_labels = sum([[k, losses[k]] for k in loss_types if k in losses], [])
        print(('Validation ||' + (' %s: %.3f |' * (len(loss_labels) // 2)) + ')') % tuple(loss_labels), flush=True)
def compute_validation_map(epoch, iteration, yolact_net, dataset, log:Log=None):
    """
    Evaluates mAP on `dataset` using the shared eval script, then restores
    train mode. When a Log is provided, records the results under type 'val'
    along with elapsed time, epoch, and iteration.
    """
    with torch.no_grad():
        yolact_net.eval()

        start = time.time()
        print()
        print("Computing validation mAP (this may take a while)...", flush=True)
        val_info = eval_script.evaluate(yolact_net, dataset, train_mode=True)
        end = time.time()

        if log is not None:
            log.log('val', val_info, elapsed=(end - start), epoch=epoch, iter=iteration)

        # Back to training mode for the caller.
        yolact_net.train()
def setup_eval():
    # Configure the shared eval script: disable its progress bar and cap the
    # number of evaluated images at the training validation size.
    eval_script.parse_args(['--no_bar', '--max_images='+str(args.validation_size)])
if __name__ == '__main__':
    # Entry point: run the training loop defined earlier in this file.
    train()
================================================
FILE: utils/__init__.py
================================================
from .augmentations import SSDAugmentation
================================================
FILE: utils/augmentations.py
================================================
import torch
from torchvision import transforms
import cv2
import numpy as np
import types
from numpy import random
from math import sqrt
from data import cfg, MEANS, STD
def intersect(box_a, box_b):
    """Area of intersection between every box in box_a [N,4] and a single box_b [4]."""
    upper_right = np.minimum(box_a[:, 2:], box_b[2:])
    lower_left = np.maximum(box_a[:, :2], box_b[:2])
    side_lengths = np.clip(upper_right - lower_left, a_min=0, a_max=np.inf)
    return side_lengths[:, 0] * side_lengths[:, 1]
def jaccard_numpy(box_a, box_b):
    """Compute the jaccard overlap (IoU) of two sets of boxes:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: Multiple bounding boxes, Shape: [num_boxes,4]
        box_b: Single bounding box, Shape: [4]
    Return:
        jaccard overlap for each box in box_a, Shape: [num_boxes]
    """
    # Intersection (inlined helper): clip the overlap rectangle of every box
    # in box_a against the single box_b.
    top_left = np.maximum(box_a[:, :2], box_b[:2])
    bottom_right = np.minimum(box_a[:, 2:], box_b[2:])
    wh = np.clip(bottom_right - top_left, a_min=0, a_max=np.inf)
    inter = wh[:, 0] * wh[:, 1]

    area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union
class Compose(object):
    """Chains several augmentations together, threading the full
    (img, masks, boxes, labels) tuple through each one in order.

    Args:
        transforms (List[Transform]): list of transforms to compose.
    Example:
        >>> augmentations.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.ToTensor(),
        >>> ])
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, masks=None, boxes=None, labels=None):
        state = (img, masks, boxes, labels)
        for transform in self.transforms:
            state = transform(*state)
        return state
class Lambda(object):
    """Wraps a lambda so it can be used as a transform in a Compose pipeline."""

    def __init__(self, lambd):
        # Only actual lambdas are accepted, matching the original contract.
        assert isinstance(lambd, types.LambdaType)
        self.lambd = lambd

    def __call__(self, img, masks=None, boxes=None, labels=None):
        return self.lambd(img, masks, boxes, labels)
class ConvertFromInts(object):
    """Casts the image from an integer dtype to float32; leaves everything else alone."""

    def __call__(self, image, masks=None, boxes=None, labels=None):
        as_float = image.astype(np.float32)
        return as_float, masks, boxes, labels
class ToAbsoluteCoords(object):
    """Converts fractional box coordinates to absolute pixel coordinates, in place."""

    def __call__(self, image, masks=None, boxes=None, labels=None):
        height, width, channels = image.shape
        # x columns scale by width, y columns by height.
        boxes[:, [0, 2]] *= width
        boxes[:, [1, 3]] *= height
        return image, masks, boxes, labels
class ToPercentCoords(object):
    """Converts absolute pixel box coordinates to fractions of the image size, in place."""

    def __call__(self, image, masks=None, boxes=None, labels=None):
        height, width, channels = image.shape
        # x columns divide by width, y columns by height.
        boxes[:, [0, 2]] /= width
        boxes[:, [1, 3]] /= height
        return image, masks, boxes, labels
class Pad(object):
    """
    Pads the image to the input width and height, filling the
    background with mean and putting the image in the top-left.

    Note: this expects im_w <= width and im_h <= height
    """

    def __init__(self, width, height, mean=MEANS, pad_gt=True):
        self.mean = mean
        self.width = width
        self.height = height
        self.pad_gt = pad_gt

    def __call__(self, image, masks, boxes=None, labels=None):
        im_h, im_w, depth = image.shape

        # Mean-filled canvas with the image pasted into the top-left corner.
        canvas = np.full((self.height, self.width, depth), self.mean, dtype=image.dtype)
        canvas[:im_h, :im_w] = image

        if self.pad_gt:
            # Zero-pad each instance mask to match the padded image.
            mask_canvas = np.zeros((masks.shape[0], self.height, self.width), dtype=masks.dtype)
            mask_canvas[:, :im_h, :im_w] = masks
            masks = mask_canvas

        return canvas, masks, boxes, labels
class Resize(object):
    """ If preserve_aspect_ratio is true, this resizes to an approximate area of max_size * max_size """

    @staticmethod
    def calc_size_preserve_ar(img_w, img_h, max_size):
        """ I mathed this one out on the piece of paper. Resulting width*height = approx max_size^2 """
        ratio = sqrt(img_w / img_h)
        w = max_size * ratio
        h = max_size / ratio
        return int(w), int(h)

    def __init__(self, resize_gt=True):
        # resize_gt=False is the eval-time path: no ground truth to transform.
        self.resize_gt = resize_gt
        self.max_size = cfg.max_size
        self.preserve_aspect_ratio = cfg.preserve_aspect_ratio

    def __call__(self, image, masks, boxes, labels=None):
        # Expects boxes in absolute pixel coordinates. When resize_gt is True,
        # `labels` must be a dict with 'labels' and 'num_crowds' keys; this
        # method mutates that dict in place when discarding tiny boxes.
        img_h, img_w, _ = image.shape

        if self.preserve_aspect_ratio:
            width, height = Resize.calc_size_preserve_ar(img_w, img_h, self.max_size)
        else:
            width, height = self.max_size, self.max_size

        image = cv2.resize(image, (width, height))

        if self.resize_gt:
            # Act like each object is a color channel
            masks = masks.transpose((1, 2, 0))
            masks = cv2.resize(masks, (width, height))

            # OpenCV resizes a (w,h,1) array to (s,s), so fix that
            if len(masks.shape) == 2:
                masks = np.expand_dims(masks, 0)
            else:
                masks = masks.transpose((2, 0, 1))

            # Scale bounding boxes (which are currently absolute coordinates)
            boxes[:, [0, 2]] *= (width / img_w)
            boxes[:, [1, 3]] *= (height / img_h)

            # Discard boxes that are smaller than we'd like
            w = boxes[:, 2] - boxes[:, 0]
            h = boxes[:, 3] - boxes[:, 1]

            keep = (w > cfg.discard_box_width) * (h > cfg.discard_box_height)
            masks = masks[keep]
            boxes = boxes[keep]
            labels['labels'] = labels['labels'][keep]
            # Crowd annotations are marked by negative labels, so recount them.
            labels['num_crowds'] = (labels['labels'] < 0).sum()

        return image, masks, boxes, labels
class RandomSaturation(object):
    """Scales the saturation channel (HSV index 1) by a random factor half the time."""

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."

    def __call__(self, image, masks=None, boxes=None, labels=None):
        # 50/50 chance of applying the jitter; mutates `image` in place.
        if random.randint(2):
            factor = random.uniform(self.lower, self.upper)
            image[:, :, 1] *= factor
        return image, masks, boxes, labels
class RandomHue(object):
    """Shifts the hue channel (HSV index 0) by a random delta half the time,
    wrapping values back into [0, 360)."""

    def __init__(self, delta=18.0):
        assert delta >= 0.0 and delta <= 360.0
        self.delta = delta

    def __call__(self, image, masks=None, boxes=None, labels=None):
        if random.randint(2):
            # In-place shift on the hue plane, then wrap out-of-range values.
            hue = image[:, :, 0]
            hue += random.uniform(-self.delta, self.delta)
            hue[hue > 360.0] -= 360.0
            hue[hue < 0.0] += 360.0
        return image, masks, boxes, labels
class RandomLightingNoise(object):
    """Channel-shuffle augmentation that is intentionally disabled: it passes
    everything through unchanged (shuffling BGR channels hurt training)."""

    def __init__(self):
        self.perms = ((0, 1, 2), (0, 2, 1),
                      (1, 0, 2), (1, 2, 0),
                      (2, 0, 1), (2, 1, 0))

    def __call__(self, image, masks=None, boxes=None, labels=None):
        # Don't shuffle the channels please, why would you do this
        # if random.randint(2):
        #     swap = self.perms[random.randint(len(self.perms))]
        #     shuffle = SwapChannels(swap)  # shuffle channels
        #     image = shuffle(image)
        return image, masks, boxes, labels
class ConvertColor(object):
    """Converts the image between BGR and HSV color spaces (only those two
    directions are supported)."""

    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current

    def __call__(self, image, masks=None, boxes=None, labels=None):
        conversion = (self.current, self.transform)
        if conversion == ('BGR', 'HSV'):
            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        elif conversion == ('HSV', 'BGR'):
            image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
        else:
            raise NotImplementedError
        return image, masks, boxes, labels
class RandomContrast(object):
    """Multiplies the whole image by a random factor half the time.
    Expects a float image; mutates it in place."""

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."

    def __call__(self, image, masks=None, boxes=None, labels=None):
        if random.randint(2):
            factor = random.uniform(self.lower, self.upper)
            image *= factor
        return image, masks, boxes, labels
class RandomBrightness(object):
    """Adds a random offset in [-delta, delta] to the whole image half the
    time. Expects a float image; mutates it in place."""

    def __init__(self, delta=32):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta

    def __call__(self, image, masks=None, boxes=None, labels=None):
        if random.randint(2):
            offset = random.uniform(-self.delta, self.delta)
            image += offset
        return image, masks, boxes, labels
class ToCV2Image(object):
    """Converts a CHW torch tensor into an HWC float32 numpy image."""

    def __call__(self, tensor, masks=None, boxes=None, labels=None):
        as_numpy = tensor.cpu().numpy().astype(np.float32)
        return as_numpy.transpose((1, 2, 0)), masks, boxes, labels
class ToTensor(object):
    """Converts an HWC numpy image into a CHW float32 torch tensor."""

    def __call__(self, cvimage, masks=None, boxes=None, labels=None):
        as_tensor = torch.from_numpy(cvimage.astype(np.float32))
        return as_tensor.permute(2, 0, 1), masks, boxes, labels
class RandomSampleCrop(object):
    """Crop
    Arguments:
        img (Image): the image being input during training
        boxes (Tensor): the original bounding boxes in pt form
        labels (Tensor): the class labels for each bbox
        mode (float tuple): the min and max jaccard overlaps
    Return:
        (img, boxes, classes)
            img (Image): the cropped image
            boxes (Tensor): the adjusted bounding boxes in pt form
            labels (Tensor): the class labels for each bbox

    Note: `labels` here is the dict with 'labels' and 'num_crowds' keys used
    throughout this file, and it is mutated in place.
    """

    def __init__(self):
        self.sample_options = (
            # using entire original input image
            None,
            # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9
            (0.1, None),
            (0.3, None),
            (0.7, None),
            (0.9, None),
            # randomly sample a patch
            (None, None),
        )

    def __call__(self, image, masks, boxes=None, labels=None):
        height, width, _ = image.shape
        while True:
            # randomly choose a mode
            mode = random.choice(self.sample_options)
            if mode is None:
                return image, masks, boxes, labels

            min_iou, max_iou = mode
            if min_iou is None:
                min_iou = float('-inf')
            if max_iou is None:
                max_iou = float('inf')

            # max trails (50)
            for _ in range(50):
                current_image = image

                # Sample a crop between 30% and 100% of each dimension.
                w = random.uniform(0.3 * width, width)
                h = random.uniform(0.3 * height, height)

                # aspect ratio constraint b/t .5 & 2
                if h / w < 0.5 or h / w > 2:
                    continue

                left = random.uniform(width - w)
                top = random.uniform(height - h)

                # convert to integer rect x1,y1,x2,y2
                rect = np.array([int(left), int(top), int(left+w), int(top+h)])

                # calculate IoU (jaccard overlap) b/t the cropped and gt boxes
                overlap = jaccard_numpy(boxes, rect)

                # This piece of code is bugged and does nothing:
                # https://github.com/amdegroot/ssd.pytorch/issues/68
                #
                # However, when I fixed it with overlap.max() < min_iou,
                # it cut the mAP in half (after 8k iterations). So it stays.
                #
                # is min and max overlap constraint satisfied? if not try again
                if overlap.min() < min_iou and max_iou < overlap.max():
                    continue

                # cut the crop from the image
                current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], :]

                # keep overlap with gt box IF center in sampled patch
                centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0

                # mask in all gt boxes that above and to the left of centers
                m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])

                # mask in all gt boxes that under and to the right of centers
                m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])

                # mask in that both m1 and m2 are true
                mask = m1 * m2

                # [0 ... 0 for num_gt and then 1 ... 1 for num_crowds]
                num_crowds = labels['num_crowds']
                crowd_mask = np.zeros(mask.shape, dtype=np.int32)
                if num_crowds > 0:
                    # Crowd annotations are stored at the end of the arrays.
                    crowd_mask[-num_crowds:] = 1

                # have any valid boxes? try again if not
                # Also make sure you have at least one regular gt
                if not mask.any() or np.sum(1-crowd_mask[mask]) == 0:
                    continue

                # take only the matching gt masks
                current_masks = masks[mask, :, :].copy()

                # take only matching gt boxes
                current_boxes = boxes[mask, :].copy()

                # take only matching gt labels
                labels['labels'] = labels['labels'][mask]
                current_labels = labels

                # We now might have fewer crowd annotations
                if num_crowds > 0:
                    labels['num_crowds'] = np.sum(crowd_mask[mask])

                # should we use the box left and top corner or the crop's
                current_boxes[:, :2] = np.maximum(current_boxes[:, :2],
                                                  rect[:2])
                # adjust to crop (by substracting crop's left,top)
                current_boxes[:, :2] -= rect[:2]

                current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:],
                                                  rect[2:])
                # adjust to crop (by substracting crop's left,top)
                current_boxes[:, 2:] -= rect[:2]

                # crop the current masks to the same dimensions as the image
                current_masks = current_masks[:, rect[1]:rect[3], rect[0]:rect[2]]

                return current_image, current_masks, current_boxes, current_labels
class Expand(object):
    """
    With probability 0.5, pastes the image at a random position on a larger
    canvas (1x to 4x each dimension) filled with `mean`, expanding masks with
    zeros and shifting boxes accordingly. Expects absolute-coordinate boxes.
    """

    def __init__(self, mean):
        self.mean = mean

    def __call__(self, image, masks, boxes, labels):
        # 50/50 chance of doing nothing.
        if random.randint(2):
            return image, masks, boxes, labels

        height, width, depth = image.shape
        ratio = random.uniform(1, 4)
        left = random.uniform(0, width*ratio - width)
        top = random.uniform(0, height*ratio - height)

        # Mean-filled canvas with the original image pasted at (top, left).
        expand_image = np.zeros(
            (int(height*ratio), int(width*ratio), depth),
            dtype=image.dtype)
        expand_image[:, :, :] = self.mean
        expand_image[int(top):int(top + height),
                     int(left):int(left + width)] = image
        image = expand_image

        # Zero-pad the masks onto the same enlarged canvas.
        expand_masks = np.zeros(
            (masks.shape[0], int(height*ratio), int(width*ratio)),
            dtype=masks.dtype)
        expand_masks[:,int(top):int(top + height),
                       int(left):int(left + width)] = masks
        masks = expand_masks

        # Shift boxes by the paste offset (copy so the caller's array is untouched).
        boxes = boxes.copy()
        boxes[:, :2] += (int(left), int(top))
        boxes[:, 2:] += (int(left), int(top))

        return image, masks, boxes, labels
class RandomMirror(object):
    """Horizontally flips image, masks, and boxes with probability 0.5."""

    def __call__(self, image, masks, boxes, labels):
        _, width, _ = image.shape
        if random.randint(2):
            image = image[:, ::-1]
            masks = masks[:, :, ::-1]
            boxes = boxes.copy()
            # Mirror x coordinates: new (x1, x2) = width - old (x2, x1).
            boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
        return image, masks, boxes, labels
class RandomFlip(object):
    """Vertically flips image, masks, and boxes with probability 0.5."""

    def __call__(self, image, masks, boxes, labels):
        height, _, _ = image.shape
        if random.randint(2):
            image = image[::-1, :]
            masks = masks[:, ::-1, :]
            boxes = boxes.copy()
            # Flip y coordinates: new (y1, y2) = height - old (y2, y1).
            boxes[:, [1, 3]] = height - boxes[:, [3, 1]]
        return image, masks, boxes, labels
class RandomRot90(object):
    """Rotates image, masks, and boxes by a random multiple of 90 degrees."""

    def __call__(self, image, masks, boxes, labels):
        old_height, old_width, _ = image.shape
        quarter_turns = random.randint(4)

        image = np.rot90(image, quarter_turns)
        masks = np.array([np.rot90(mask, quarter_turns) for mask in masks])

        # Rotate the boxes one quarter turn at a time, swapping the tracked
        # image dimensions after each turn.
        boxes = boxes.copy()
        for _ in range(quarter_turns):
            rotated = []
            for x1, y1, x2, y2 in boxes:
                rotated.append([y1, old_width - 1 - x2, y2, old_width - 1 - x1])
            boxes = np.array(rotated)
            old_width, old_height = old_height, old_width

        return image, masks, boxes, labels
class SwapChannels(object):
    """Reorders an image's channels according to the given permutation.

    Args:
        swaps (int triple): final order of channels, e.g. (2, 1, 0)
    """

    def __init__(self, swaps):
        self.swaps = swaps

    def __call__(self, image):
        """
        Args:
            image (Tensor): image tensor to be transformed
        Return:
            a tensor with channels swapped according to swap
        """
        return image[:, :, self.swaps]
class PhotometricDistort(object):
    """SSD-style photometric jitter: random brightness, then (in a random
    order) contrast plus an HSV round-trip of saturation/hue jitter, followed
    by the (disabled) lighting-noise stage."""

    def __init__(self):
        self.pd = [
            RandomContrast(),
            ConvertColor(transform='HSV'),
            RandomSaturation(),
            RandomHue(),
            ConvertColor(current='HSV', transform='BGR'),
            RandomContrast()
        ]
        self.rand_brightness = RandomBrightness()
        self.rand_light_noise = RandomLightingNoise()

    def __call__(self, image, masks, boxes, labels):
        # Work on a copy so the caller's image is left untouched.
        distorted = image.copy()
        distorted, masks, boxes, labels = self.rand_brightness(distorted, masks, boxes, labels)

        # Apply contrast either before or after the HSV round-trip
        # (drop the trailing or leading RandomContrast, respectively).
        if random.randint(2):
            pipeline = Compose(self.pd[:-1])
        else:
            pipeline = Compose(self.pd[1:])
        distorted, masks, boxes, labels = pipeline(distorted, masks, boxes, labels)

        return self.rand_light_noise(distorted, masks, boxes, labels)
class PrepareMasks(object):
    """
    Prepares the gt masks for use_gt_bboxes by cropping with the gt box
    and downsampling the resulting mask to mask_size, mask_size. This
    function doesn't do anything if cfg.use_gt_bboxes is False.
    """

    def __init__(self, mask_size, use_gt_bboxes):
        self.mask_size = mask_size
        self.use_gt_bboxes = use_gt_bboxes

    def __call__(self, image, masks, boxes, labels=None):
        if not self.use_gt_bboxes:
            return image, masks, boxes, labels

        height, width, _ = image.shape
        # One flattened mask_size x mask_size mask per instance.
        new_masks = np.zeros((masks.shape[0], self.mask_size ** 2))

        for i in range(len(masks)):
            # Boxes are fractional here; scale back to pixels before cropping.
            x1, y1, x2, y2 = boxes[i, :]
            x1 = int(x1 * width)
            x2 = int(x2 * width)
            y1 = int(y1 * height)
            y2 = int(y2 * height)

            # +1 So that if y1=10.6 and y2=10.9 we still have a bounding box
            cropped_mask = masks[i, y1:(y2+1), x1:(x2+1)]
            scaled_mask = cv2.resize(cropped_mask, (self.mask_size, self.mask_size))

            new_masks[i, :] = scaled_mask.reshape(1, -1)

        # Binarize
        new_masks[new_masks > 0.5] = 1
        new_masks[new_masks <= 0.5] = 0

        return image, new_masks, boxes, labels
class BackboneTransform(object):
    """
    Transforms a BRG image made of floats in the range [0, 255] to whatever
    input the current backbone network needs.

    transform is a transform config object (see config.py).
    in_channel_order is probably 'BGR' but you do you, kid.
    """

    def __init__(self, transform, mean, std, in_channel_order):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.transform = transform

        # Map each channel letter to its index in the input ordering, then
        # build the permutation producing the backbone's expected order.
        source_index = {channel: idx for idx, channel in enumerate(in_channel_order)}
        self.channel_map = source_index
        self.channel_permutation = [source_index[channel] for channel in transform.channel_order]

    def __call__(self, img, masks=None, boxes=None, labels=None):
        img = img.astype(np.float32)

        # Exactly one normalization mode applies, chosen by the config.
        if self.transform.normalize:
            img = (img - self.mean) / self.std
        elif self.transform.subtract_means:
            img = img - self.mean
        elif self.transform.to_float:
            img = img / 255

        img = img[:, :, self.channel_permutation]

        return img.astype(np.float32), masks, boxes, labels
class BaseTransform(object):
    """Transform used at evaluation time: float conversion, resize without
    ground truth, and backbone-specific normalization."""

    def __init__(self, mean=MEANS, std=STD):
        steps = [
            ConvertFromInts(),
            Resize(resize_gt=False),
            BackboneTransform(cfg.backbone.transform, mean, std, 'BGR'),
        ]
        self.augment = Compose(steps)

    def __call__(self, img, masks=None, boxes=None, labels=None):
        return self.augment(img, masks, boxes, labels)
import torch.nn.functional as F
class FastBaseTransform(torch.nn.Module):
    """
    Transform that does all operations on the GPU for super speed.
    This doesn't suppport a lot of config settings and should only be used for production.
    Maintain this as necessary.

    Input:  batched BGR image tensor in [n, h, w, c] order, values 0-255.
    Output: [n, c, h, w] tensor in RGB, normalized per cfg.backbone.transform.
    """

    def __init__(self):
        super().__init__()

        # NOTE(review): the .cuda() calls make construction require a CUDA
        # device even though forward() re-homes the buffers with .to(img.device)
        # — confirm whether CPU-only inference is meant to be supported here.
        self.mean = torch.Tensor(MEANS).float().cuda()[None, :, None, None]
        self.std = torch.Tensor( STD ).float().cuda()[None, :, None, None]
        self.transform = cfg.backbone.transform

    def forward(self, img):
        # Keep normalization buffers on the same device as the input batch.
        self.mean = self.mean.to(img.device)
        self.std = self.std.to(img.device)

        # img assumed to be a pytorch BGR image with channel order [n, h, w, c]
        if cfg.preserve_aspect_ratio:
            _, h, w, _ = img.size()
            img_size = Resize.calc_size_preserve_ar(w, h, cfg.max_size)
            img_size = (img_size[1], img_size[0]) # Pytorch needs h, w
        else:
            img_size = (cfg.max_size, cfg.max_size)

        img = img.permute(0, 3, 1, 2).contiguous()
        img = F.interpolate(img, img_size, mode='bilinear', align_corners=False)

        # Exactly one normalization mode applies, chosen by the config.
        if self.transform.normalize:
            img = (img - self.mean) / self.std
        elif self.transform.subtract_means:
            img = (img - self.mean)
        elif self.transform.to_float:
            img = img / 255

        if self.transform.channel_order != 'RGB':
            raise NotImplementedError

        # BGR -> RGB.
        img = img[:, (2, 1, 0), :, :].contiguous()

        # Return value is in channel order [n, c, h, w] and RGB
        return img
def do_nothing(img=None, masks=None, boxes=None, labels=None):
    # Identity transform used when an augmentation is disabled in the config.
    return img, masks, boxes, labels
def enable_if(condition, obj):
    """Returns obj when condition is truthy, otherwise the do_nothing passthrough."""
    if condition:
        return obj
    return do_nothing
class SSDAugmentation(object):
    """ Transform to be used when training.

    Builds the full training pipeline; individual stages are toggled by the
    corresponding cfg.augment_* flags via enable_if.
    """

    def __init__(self, mean=MEANS, std=STD):
        self.augment = Compose([
            ConvertFromInts(),
            ToAbsoluteCoords(),
            enable_if(cfg.augment_photometric_distort, PhotometricDistort()),
            enable_if(cfg.augment_expand, Expand(mean)),
            enable_if(cfg.augment_random_sample_crop, RandomSampleCrop()),
            enable_if(cfg.augment_random_mirror, RandomMirror()),
            enable_if(cfg.augment_random_flip, RandomFlip()),
            # NOTE(review): RandomRot90 is gated on cfg.augment_random_flip,
            # which looks like a copy-paste of the line above — presumably a
            # dedicated cfg.augment_random_rot90 flag was intended; confirm
            # against data/config.py before changing.
            enable_if(cfg.augment_random_flip, RandomRot90()),
            Resize(),
            enable_if(not cfg.preserve_aspect_ratio, Pad(cfg.max_size, cfg.max_size, mean)),
            ToPercentCoords(),
            PrepareMasks(cfg.mask_size, cfg.use_gt_bboxes),
            BackboneTransform(cfg.backbone.transform, mean, std, 'BGR')
        ])

    def __call__(self, img, masks, boxes, labels):
        return self.augment(img, masks, boxes, labels)
================================================
FILE: utils/cython_nms.pyx
================================================
## Note: Figure out the license details later.
#
# Based on:
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
cimport cython
import numpy as np
cimport numpy as np
# GIL-free scalar helpers used inside the nogil NMS loop below.
cdef inline np.float32_t max(np.float32_t a, np.float32_t b) nogil:
    return a if a >= b else b

cdef inline np.float32_t min(np.float32_t a, np.float32_t b) nogil:
    return a if a <= b else b
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
def nms(np.ndarray[np.float32_t, ndim=2] dets, np.float32_t thresh):
    """
    Greedy non-maximum suppression.

    dets:   [N, 5] float32 array of (x1, y1, x2, y2, score) rows.
    thresh: IoU threshold; lower-scoring boxes overlapping a kept box by
            >= thresh are suppressed.
    Returns the indices of the kept rows.
    """
    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]

    # +1 inclusive-pixel convention for box areas (matches Fast R-CNN).
    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    cdef np.ndarray[np.int64_t, ndim=1] order = scores.argsort()[::-1]

    cdef int ndets = dets.shape[0]
    # np.int was a deprecated alias for the Python builtin int and was removed
    # in NumPy 1.24 (AttributeError at runtime); np.int_ is the platform C
    # long that the np.int_t buffer type maps to.
    cdef np.ndarray[np.int_t, ndim=1] suppressed = \
        np.zeros((ndets), dtype=np.int_)

    # nominal indices
    cdef int _i, _j
    # sorted indices
    cdef int i, j
    # temp variables for box i's (the box currently under consideration)
    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
    # variables for computing overlap with box j (lower scoring box)
    cdef np.float32_t xx1, yy1, xx2, yy2
    cdef np.float32_t w, h
    cdef np.float32_t inter, ovr

    with nogil:
        for _i in range(ndets):
            i = order[_i]
            if suppressed[i] == 1:
                continue
            ix1 = x1[i]
            iy1 = y1[i]
            ix2 = x2[i]
            iy2 = y2[i]
            iarea = areas[i]
            for _j in range(_i + 1, ndets):
                j = order[_j]
                if suppressed[j] == 1:
                    continue
                xx1 = max(ix1, x1[j])
                yy1 = max(iy1, y1[j])
                xx2 = min(ix2, x2[j])
                yy2 = min(iy2, y2[j])
                w = max(0.0, xx2 - xx1 + 1)
                h = max(0.0, yy2 - yy1 + 1)
                inter = w * h
                ovr = inter / (iarea + areas[j] - inter)
                if ovr >= thresh:
                    suppressed[j] = 1

    return np.where(suppressed == 0)[0]
================================================
FILE: utils/functions.py
================================================
import torch
import torch.nn as nn
import os
import math
from collections import deque
from pathlib import Path
from layers.interpolate import InterpolateModule
class MovingAverage():
    """ Keeps an average window of the specified number of items. """

    def __init__(self, max_window_size=1000):
        self.max_window_size = max_window_size
        self.reset()

    def add(self, elem):
        """ Adds an element to the window, removing the earliest element if necessary. """
        # Non-finite values (inf/nan) would poison the running sum, so skip them.
        if not math.isfinite(elem):
            print('Warning: Moving average ignored a value of %f' % elem)
            return

        self.window.append(elem)
        self.sum += elem

        while len(self.window) > self.max_window_size:
            self.sum -= self.window.popleft()

    def append(self, elem):
        """ Same as add just more pythonic. """
        self.add(elem)

    def reset(self):
        """ Resets the MovingAverage to its initial state. """
        self.window = deque()
        self.sum = 0

    def get_avg(self):
        """ Returns the average of the elements in the window. """
        denominator = max(len(self.window), 1)
        return self.sum / denominator

    def __str__(self):
        return str(self.get_avg())

    def __repr__(self):
        return repr(self.get_avg())

    def __len__(self):
        return len(self.window)
class ProgressBar():
    """ A simple progress bar that just outputs a string. """

    def __init__(self, length, max_val):
        self.max_val = max_val
        self.length = length
        self.cur_val = 0

        # Force the first _update_str() to build the string.
        self.cur_num_bars = -1
        self._update_str()

    def set_val(self, new_val):
        # Clamp above first, then below, matching the original order.
        clamped = new_val
        if clamped > self.max_val:
            clamped = self.max_val
        if clamped < 0:
            clamped = 0
        self.cur_val = clamped
        self._update_str()

    def is_finished(self):
        return self.cur_val == self.max_val

    def _update_str(self):
        # Rebuild the bar string only when the filled count actually changes.
        filled = int(self.length * (self.cur_val / self.max_val))
        if filled != self.cur_num_bars:
            self.cur_num_bars = filled
            self.string = '█' * filled + '░' * (self.length - filled)

    def __repr__(self):
        return self.string

    def __str__(self):
        return self.string
def init_console():
    """
    Initialize the console to be able to use ANSI escape characters on Windows.
    """
    if os.name != 'nt':
        return
    from colorama import init
    init()
class SavePath:
    """
    Encapsulates the "<model_name>_<epoch>_<iteration>.pth" checkpoint naming
    scheme: builds paths from parts and parses parts back out of paths.
    """

    def __init__(self, model_name:str, epoch:int, iteration:int):
        self.model_name = model_name
        self.epoch = epoch
        self.iteration = iteration

    def get_path(self, root:str=''):
        """Joins root with the formatted checkpoint file name."""
        file_name = '_'.join([self.model_name, str(self.epoch), str(self.iteration)]) + '.pth'
        return os.path.join(root, file_name)

    @staticmethod
    def from_str(path:str):
        """Parses a checkpoint path back into a SavePath. Raises if the name
        does not end in <epoch>_<iteration> (with an optional _interrupt suffix)."""
        stem = os.path.basename(path)

        if stem.endswith('.pth'):
            stem = stem[:-len('.pth')]

        parts = stem.split('_')
        # Interrupt checkpoints carry a trailing "_interrupt" token.
        if stem.endswith('interrupt'):
            parts = parts[:-1]

        return SavePath('_'.join(parts[:-2]), int(parts[-2]), int(parts[-1]))

    @staticmethod
    def remove_interrupt(save_folder):
        """Deletes every *_interrupt.pth checkpoint in save_folder."""
        for candidate in Path(save_folder).glob('*_interrupt.pth'):
            candidate.unlink()

    @staticmethod
    def get_interrupt(save_folder):
        """Returns the path of some *_interrupt.pth checkpoint, or None."""
        for candidate in Path(save_folder).glob('*_interrupt.pth'):
            return str(candidate)
        return None

    @staticmethod
    def get_latest(save_folder, config):
        """ Note: config should be config.name. """
        max_iter = -1
        max_name = None

        for candidate in Path(save_folder).glob(config + '_*'):
            path_name = str(candidate)

            try:
                save = SavePath.from_str(path_name)
            except:
                # Not a parseable checkpoint name; ignore it.
                continue

            if save.model_name == config and save.iteration > max_iter:
                max_iter = save.iteration
                max_name = path_name

        return max_name
def make_net(in_channels, conf, include_last_relu=True):
    """
    A helper function to take a config setting and turn it into a network.
    Used by protonet and extrahead. Returns (network, out_channels)

    conf is a list of layer tuples (see the pattern comment below); each conv
    and deconv layer is followed by an inplace ReLU, optionally dropping the
    trailing ReLU when include_last_relu is False.
    """
    def make_layer(layer_cfg):
        # Tracks the running channel count across layers in this net.
        nonlocal in_channels

        # Possible patterns:
        # ( 256, 3, {}) -> conv
        # ( 256,-2, {}) -> deconv
        # (None,-2, {}) -> bilinear interpolate
        # ('cat',[],{}) -> concat the subnetworks in the list
        #
        # You know it would have probably been simpler just to adopt a 'c' 'd' 'u' naming scheme.
        # Whatever, it's too late now.
        if isinstance(layer_cfg[0], str):
            layer_name = layer_cfg[0]

            if layer_name == 'cat':
                # Recursively build each subnetwork, then concatenate them.
                nets = [make_net(in_channels, x) for x in layer_cfg[1]]
                layer = Concat([net[0] for net in nets], layer_cfg[2])
                num_channels = sum([net[1] for net in nets])
        else:
            num_channels = layer_cfg[0]
            kernel_size = layer_cfg[1]

            if kernel_size > 0:
                layer = nn.Conv2d(in_channels, num_channels, kernel_size, **layer_cfg[2])
            else:
                # Negative kernel size selects an upsampling layer.
                if num_channels is None:
                    layer = InterpolateModule(scale_factor=-kernel_size, mode='bilinear', align_corners=False, **layer_cfg[2])
                else:
                    layer = nn.ConvTranspose2d(in_channels, num_channels, -kernel_size, **layer_cfg[2])

        in_channels = num_channels if num_channels is not None else in_channels

        # Don't return a ReLU layer if we're doing an upsample. This probably doesn't affect anything
        # output-wise, but there's no need to go through a ReLU here.
        # Commented out for backwards compatibility with previous models
        # if num_channels is None:
        #     return [layer]
        # else:
        return [layer, nn.ReLU(inplace=True)]

    # Use sum to concat together all the component layer lists
    net = sum([make_layer(x) for x in conf], [])
    if not include_last_relu:
        net = net[:-1]

    return nn.Sequential(*(net)), in_channels
================================================
FILE: utils/logger.py
================================================
import os
import json
import time
import sys
from typing import Union
import datetime
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
# Because Python's package heierarchy system sucks
if __name__ == '__main__':
from nvinfo import gpu_info, visible_gpus, nvsmi_available
from functions import MovingAverage
else:
from .nvinfo import gpu_info, visible_gpus, nvsmi_available
from .functions import MovingAverage
class Log:
    """
    A class to log information during training per information and save it out.
    It also can include extra debug information like GPU usage / temp automatically.

    Each line in the log file is one JSON object; sessions are delimited by
    entries of type 'session'.

    Extra args:
     - session_data: If you have any data unique to this session, put it here.
     - overwrite: Whether or not to overwrite a pre-existing log with this name.
     - log_gpu_stats: Whether or not to log gpu information like temp, usage, memory.
                      Note that this requires nvidia-smi to be present in your PATH.
     - log_time: Also log the time in each iteration.
    """

    def __init__(self, log_name:str, log_dir:str='logs/', session_data:dict={},
                 overwrite:bool=False, log_gpu_stats:bool=True, log_time:bool=True):
        if log_gpu_stats and not nvsmi_available():
            print('Warning: Log created with log_gpu_stats=True, but nvidia-smi ' \
                  'was not found. Setting log_gpu_stats to False.')
            log_gpu_stats = False

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        self.log_path = os.path.join(log_dir, log_name + '.log')

        # if os.path.exists(self.log_path) and overwrite:
        #     os.unlink(self.log_path)

        if os.path.exists(self.log_path):
            # Log already exists, so we're going to add to it. Increment the session counter.
            # NOTE(review): this scans to the last line of the file; if the file
            # exists but is completely empty, `last` is never bound and this
            # raises a NameError — confirm whether empty logs can occur.
            with open(self.log_path, 'r') as f:
                for last in f: pass

                if len(last) > 1:
                    self.session = json.loads(last)['session'] + 1
                else:
                    self.session = 0
        else:
            self.session = 0

        self.log_gpu_stats = log_gpu_stats
        self.log_time = log_time

        if self.log_gpu_stats:
            self.visible_gpus = visible_gpus()

        self._log_session_header(session_data)

    def _log_session_header(self, session_data:dict):
        """
        Log information that does not change between iterations here.
        This is to cut down on the file size so you're not outputing this every iteration.
        """
        info = {}
        info['type'] = 'session'
        info['session'] = self.session
        info['data'] = session_data

        if self.log_gpu_stats:
            # Static per-GPU facts; the dynamic ones are logged per iteration.
            keys = ['idx', 'name', 'uuid', 'pwr_cap', 'mem_total']
            gpus = gpu_info()

            info['gpus'] = [{k: gpus[i][k] for k in keys} for i in self.visible_gpus]

        if self.log_time:
            info['time'] = time.time()

        out = json.dumps(info) + '\n'

        with open(self.log_path, 'a') as f:
            f.write(out)

    def log(self, type:str, data:dict={}, **kwdargs):
        """
        Add an iteration to the log with the specified data points.
        Type should be the type of information this is (e.g., train, valid, etc.)

        You can either pass data points as kwdargs, or as a dictionary (or both!).
        Values should be json-serializable.
        """
        info = {}

        info['type'] = type
        info['session'] = self.session

        # Dictionary entries override keyword arguments of the same name.
        kwdargs.update(data)
        info['data'] = kwdargs

        if self.log_gpu_stats:
            keys = ['fan_spd', 'temp', 'pwr_used', 'mem_used', 'util']
            gpus = gpu_info()

            info['gpus'] = [{k: gpus[i][k] for k in keys} for i in self.visible_gpus]

        if self.log_time:
            info['time'] = time.time()

        out = json.dumps(info) + '\n'

        with open(self.log_path, 'a') as f:
            f.write(out)
class LogEntry():
    """ A class that allows you to navigate a dictonary using x.a.b[2].c, etc.
    Nested dicts/lists are wrapped in LogEntry on the fly. """

    def __init__(self, entry:Union[dict, list]):
        self._ = entry

    def __getattr__(self, name):
        if name == '_':
            return self.__dict__['_']

        value = self.__dict__['_'][name]
        # Wrap plain containers so chained attribute access keeps working.
        # Exact type checks (not isinstance) preserve the original semantics
        # for dict/list subclasses.
        if type(value) in (dict, list):
            return LogEntry(value)
        return value

    def __getitem__(self, name):
        return self.__getattr__(name)

    def __len__(self):
        return len(self.__dict__['_'])
class LogVisualizer():
    """
    Loads one or more log files (one json object per line, as written by the
    Log class) and provides querying, plotting, and timing utilities over them.

    Queries are strings of the form '<path>[; <condition>]' where both parts
    are python expressions evaluated with x = the current entry (a LogEntry)
    and s = that entry's session header entry.
    Example: 'x.data.loss; x.data.epoch > 0'.

    NOTE(review): query strings are passed through eval(), so only feed this
    trusted input — it is an interactive analysis tool, not a service.
    """

    # Color cycle (matplotlib xkcd names) used to tell different logs apart.
    COLORS = [
        'xkcd:azure',
        'xkcd:coral',
        'xkcd:turquoise',
        'xkcd:orchid',
        'xkcd:orange',
        'xkcd:blue',
        'xkcd:red',
        'xkcd:teal',
        'xkcd:magenta',
        'xkcd:orangered'
    ]

    def __init__(self):
        self.logs = []        # Per added file: dict of entry type -> [LogEntry]
        self.total_logs = []  # Per added file: flat list of every LogEntry in order
        self.log_names = []   # Per added file: display name used in plot legends

    def _decode(self, query:str) -> list:
        """
        Compile a query string into a (path, select) pair of callables, each
        taking (x, s). A missing condition selects everything; a missing path
        returns the entry itself.
        """
        path, select = (query.split(';') + [''])[:2]
        if select.strip() == '':
            select = lambda x, s: True
        else:
            # eval on user-supplied text: fine interactively, unsafe otherwise.
            select = eval('lambda x, s: ' + select)
        if path.strip() == '':
            path = lambda x, s: x
        else:
            path = eval('lambda x, s: ' + path)
        return path, select

    def _follow(self, entry:LogEntry, query:list):
        """
        Apply a decoded (path, select) query to one entry. Returns the value
        (unwrapped from LogEntry), or None when the entry is not selected or
        the path doesn't exist in it.
        """
        path, select = query
        try:
            if select(entry, entry._s):
                res = path(entry, entry._s)
                # Unwrap LogEntry results back to the underlying dict/list.
                if type(res) == LogEntry:
                    return res.__dict__['_']
                else:
                    return res
            else:
                return None
        except (KeyError, IndexError):
            # The queried key/index doesn't exist in this entry.
            return None

    def _color(self, idx:int):
        """ Color for the idx-th log, cycling through COLORS. """
        return self.COLORS[idx % len(self.COLORS)]

    def sessions(self, path:str):
        """ Prints statistics about the sessions in the file. """
        if not os.path.exists(path):
            print(path + ' doesn\'t exist!')
            return
        cur_session = None
        cur_time = 0
        last_time = 0
        num_entries = 0
        def pop_session():
            # Print a summary line for the session that just ended.
            delta = last_time - cur_time
            time_str = str(datetime.timedelta(seconds=delta)).split('.')[0]
            print('Session % 3d: % 8d entries | %s elapsed' % (cur_session, num_entries, time_str))
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if len(line) > 0:
                    js = json.loads(line)
                    # A 'session' entry marks the start of a new session.
                    if js['type'] == 'session':
                        if cur_session is not None:
                            pop_session()
                        cur_time = js['time']
                        cur_session = js['session']
                        num_entries = 0
                    # NOTE(review): assumes every entry has a 'time' key,
                    # i.e. the log was written with log_time enabled.
                    last_time = js['time']
                    num_entries += 1
        # Flush the final session.
        pop_session()

    def add(self, path:str, session:Union[int,list]=None):
        """ Add a log file to the list of logs being considered. """
        log = defaultdict(lambda: [])
        total_log = []
        if not os.path.exists(path):
            print(path + ' doesn\'t exist!')
            return
        session_idx = 0
        ignoring = True
        def valid(idx):
            # session=None keeps everything; an int keeps one session;
            # any other container keeps sessions whose idx is in it.
            if session is None:
                return True
            elif type(session) == int:
                return (idx == session)
            else:
                return idx in session
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if len(line) > 0:
                    js = json.loads(line)
                    _type = js['type']
                    if _type == 'session':
                        session_idx = js['session']
                        ignoring = not valid(session_idx)
                    if not ignoring:
                        ljs = LogEntry(js)
                        # Give every entry an '_s' back-pointer to its session
                        # header (session entries point at themselves) so
                        # queries can reference it as the 's' argument.
                        if _type == 'session':
                            js['_s'] = ljs
                        else:
                            js['_s'] = log['session'][-1]
                        log[_type].append(ljs)
                        total_log.append(ljs)
        name = os.path.basename(path)
        if session is not None:
            name += ' (Session %s)' % session
        self.logs.append(log)
        self.total_logs.append(total_log)
        self.log_names.append(name)

    def query(self, x:Union[str, list], entry_type:str=None, x_idx:int=None, log_idx:int=None) -> list:
        """
        Given a query string (can be already decoded for faster computation), query the entire log
        and return all values found by that query. If both log_idx and x_idx is None, this will be
        a list of lists in the form [log_idx][result_idx]. If x_idx is not None, then the result
        will be a list of [log_idx]. If both are not none, the return value will be a single query
        return value. With entry_type=None, this will search the entire log.
        """
        # Accept pre-decoded (path, select) pairs to skip repeated eval.
        if type(x) is not list:
            x = self._decode(x)
        res = []
        for idx in (range(len(self.logs)) if log_idx is None else [log_idx]):
            candidates = []
            log = self.total_logs[idx] if entry_type is None else self.logs[idx][entry_type]
            for entry in log:
                candidate = self._follow(entry, x)
                if candidate is not None:
                    candidates.append(candidate)
            if x_idx is not None:
                candidates = candidates[x_idx]
            res.append(candidates)
        if log_idx is not None:
            res = res[0]
        return res

    def check(self, entry_type:str, x:str):
        """ Checks the log for the valid keys for this input. """
        keys = set()
        x = self._decode(x)
        for log in self.logs:
            for datum in log[entry_type]:
                res = self._follow(datum, x)
                # Dicts contribute their keys; lists contribute a length hint.
                if type(res) == dict:
                    for key in res.keys():
                        keys.add(key)
                elif type(res) == list:
                    keys.add('< %d' % len(res))
        return list(keys)

    def plot(self, entry_type:str, x:str, y:str, smoothness:int=0):
        """ Plot sequential log data. """
        query_x = self._decode(x)
        query_y = self._decode(y)
        for idx, (log, name) in enumerate(zip(self.logs, self.log_names)):
            log = log[entry_type]
            if smoothness > 1:
                # Moving-average smoothing of the y values.
                avg = MovingAverage(smoothness)
            _x = []
            _y = []
            for datum in log:
                val_x = self._follow(datum, query_x)
                val_y = self._follow(datum, query_y)
                # Only plot points where both queries produced a value.
                if val_x is not None and val_y is not None:
                    if smoothness > 1:
                        avg.append(val_y)
                        val_y = avg.get_avg()
                        # Skip the first few points until the average warms up.
                        if len(avg) < smoothness // 10:
                            continue
                    _x.append(val_x)
                    _y.append(val_y)
            plt.plot(_x, _y, color=self._color(idx), label=name)
        plt.title(y.replace('x.', entry_type + '.'))
        plt.legend()
        plt.grid(linestyle=':', linewidth=0.5)
        plt.show()

    def bar(self, entry_type:str, x:str, labels:list=None, diff:bool=False, x_idx:int=-1):
        """ Plot a bar chart. The result of x should be list or dictionary. """
        query = self._decode(x)
        data_points = []
        for idx, (log, name) in enumerate(zip(self.logs, self.log_names)):
            log = log[entry_type]
            candidates = []
            for entry in log:
                test = self._follow(entry, query)
                if type(test) == dict:
                    candidates.append(test)
                elif type(test) == list:
                    # Lists become {index: value} so they plot like dicts.
                    candidates.append({idx: v for idx, v in enumerate(test)})
            if len(candidates) > 0:
                # Keep only the x_idx-th matching entry per log (default: last).
                data_points.append((name, candidates[x_idx]))
        if len(data_points) == 0:
            print('Warning: Nothing to show in bar chart!')
            return
        names = [x[0] for x in data_points]
        data_points = [x[1] for x in data_points]
        # Construct the labels for the data
        if labels is not None:
            data_labels = labels
        else:
            # Union of keys across all logs, sorted for a stable axis.
            data_labels = set()
            for datum in data_points:
                for k in datum:
                    data_labels.add(k)
            data_labels = list(data_labels)
            data_labels.sort()
        # None marks a label a given log has no value for.
        data_values = [[(datum[k] if k in datum else None) for k in data_labels] for datum in data_points]
        if diff:
            # diff mode: subtract the first log's values from every log.
            # Iterate in reverse so index 0 is modified last (it becomes 0).
            for idx in reversed(range(len(data_values))):
                for jdx in range(len(data_labels)):
                    if data_values[0][jdx] is None or data_values[idx][jdx] is None:
                        data_values[idx][jdx] = None
                    else:
                        data_values[idx][jdx] -= data_values[0][jdx]
        series_labels = names
        # Plot the graph now
        num_bars = len(series_labels)
        bar_width = 1 / (num_bars + 1)
        # Set position of bar on X axis
        positions = [np.arange(len(data_labels))]
        for _ in range(1, num_bars):
            positions.append([x + bar_width for x in positions[-1]])
        # Make the plot
        for idx, (series, data, pos) in enumerate(zip(series_labels, data_values, positions)):
            plt.bar(pos, data, color=self._color(idx), width=bar_width, edgecolor='white', label=series)
        # Add xticks on the middle of the group bars
        plt.title(x.replace('x.', entry_type + '.') + (' diff' if diff else ''))
        plt.xticks([r + bar_width for r in range(len(data_labels))], data_labels)
        # Create legend & Show graphic
        plt.legend()
        plt.show()

    def elapsed_time(self, cond1:str='', cond2:str='', legible:bool=True) -> list:
        """
        Returns the elapsed time between two entries based on the given conditionals.
        If a query isn't specified, the first / last entry will be used. The first query
        uses the first value and the second query uses the last value in the results.
        Setting legible to true returns human-readable results, while false returns seconds.
        """
        q1 = 'x.time; ' + cond1
        q2 = 'x.time; ' + cond2
        # First matching timestamp for cond1, last matching timestamp for cond2.
        x1 = self.query(q1, x_idx=0)
        x2 = self.query(q2, x_idx=-1)
        diff = (lambda x: str(datetime.timedelta(seconds=x)).split('.')[0]) if legible else lambda x: x
        return [diff(b - a) for a, b in zip(x1, x2)]
if __name__ == '__main__':
    # Simple CLI: plot a y query against an x query for one log file.
    if len(sys.argv) < 4+1:
        # The usage message previously listed no arguments at all; spell out
        # the four positional arguments the script actually requires.
        print('Usage: python utils/logger.py <log_file> <entry_type> <x_query> <y_query>')
        exit()
    vis = LogVisualizer()
    vis.add(sys.argv[1])
    vis.plot(sys.argv[2], sys.argv[3], sys.argv[4])
================================================
FILE: utils/nvinfo.py
================================================
# My version of nvgpu because nvgpu didn't have all the information I was looking for.
import re
import subprocess
import shutil
import os
def gpu_info() -> list:
    """
    Returns a dictionary of stats mined from nvidia-smi for each gpu in a list.
    Adapted from nvgpu: https://pypi.org/project/nvgpu/, but mine has more info.

    Each dict contains: idx (int), name, uuid, fan_spd, temp, pwr_used,
    pwr_cap, mem_used, mem_total, and util (all ints except name/uuid).

    NOTE(review): this parses the human-readable table printed by
    `nvidia-smi`, so it is sensitive to that tool's exact output layout.
    """
    gpus = [line for line in _run_cmd(['nvidia-smi', '-L']) if line]
    # Raw string so the escaped parens aren't invalid str escape sequences.
    gpu_infos = [re.match(r'GPU ([0-9]+): ([^(]+) \(UUID: ([^)]+)\)', gpu).groups() for gpu in gpus]
    gpu_infos = [dict(zip(['idx', 'name', 'uuid'], info)) for info in gpu_infos]
    gpu_count = len(gpus)
    lines = _run_cmd(['nvidia-smi'])
    # Each gpu occupies 3 lines of the table, starting at line 7.
    selected_lines = lines[7:7 + 3 * gpu_count]
    for i in range(gpu_count):
        # The middle line of each gpu's cell holds fan/temp/power, memory,
        # and utilization columns separated by '|'. (The old duplicate parse
        # of the memory column here was dead code and has been removed.)
        pw_tmp_info, mem_info, util_info = [x.strip() for x in selected_lines[3 * i + 1].split('|')[1:-1]]
        # Strip the trailing unit character from each token (%, C, W, ...).
        pw_tmp_info = [x[:-1] for x in pw_tmp_info.split(' ') if len(x) > 0]
        # Use j (not i) so the comprehension doesn't shadow the gpu index.
        fan_speed, temperature, pwr_used, pwr_cap = [int(pw_tmp_info[j]) for j in (0, 1, 3, 5)]
        gpu_infos[i]['fan_spd' ] = fan_speed
        gpu_infos[i]['temp'    ] = temperature
        gpu_infos[i]['pwr_used'] = pwr_used
        gpu_infos[i]['pwr_cap' ] = pwr_cap
        mem_used, mem_total = [int(x) for x in mem_info.replace('MiB', '').split(' / ')]
        gpu_infos[i]['mem_used' ] = mem_used
        gpu_infos[i]['mem_total'] = mem_total
        utilization = int(util_info.split(' ')[0][:-1])
        gpu_infos[i]['util'] = utilization
        gpu_infos[i]['idx'] = int(gpu_infos[i]['idx'])
    return gpu_infos
def nvsmi_available() -> bool:
    """ True iff the nvidia-smi executable can be found on this system's PATH. """
    found = shutil.which('nvidia-smi')
    return found is not None
def visible_gpus() -> list:
    """ Returns a list of the indexes of all the gpus visible to pytorch. """
    devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if devices is None:
        # No restriction set, so every gpu nvidia-smi reports is visible.
        return list(range(len(gpu_info())))
    return [int(token.strip()) for token in devices.split(',')]
def _run_cmd(cmd:list) -> list:
""" Runs a command and returns a list of output lines. """
output = subprocess.check_output(cmd)
output = output.decode('UTF-8')
return output.split('\n')
================================================
FILE: utils/timer.py
================================================
import time
from collections import defaultdict
# Accumulated elapsed time (seconds) per function name.
_total_times = defaultdict(lambda: 0)
# perf_counter() value when each name's timer last started (-1 = never started).
_start_times = defaultdict(lambda: -1)
# Names excluded from print_stats and total_time.
_disabled_names = set()
# Stack of paused timer names (see start/stop with use_stack=True).
_timer_stack = []
# Name of the timer currently running in stack mode, or None.
_running_timer = None
# When True, start/stop become no-ops.
_disable_all = False
def disable_all():
    """ Globally turns timing off: start/stop become no-ops until enable_all. """
    global _disable_all
    _disable_all = True
def enable_all():
    """ Re-enables timing after a call to disable_all. """
    global _disable_all
    _disable_all = False
def disable(fn_name):
    """ Disables the given function name from being considered for the average or outputted in print_stats. """
    _disabled_names.add(fn_name)
def enable(fn_name):
    """
    Enables function names disabled by disable.
    Uses discard so enabling a name that was never disabled is a harmless
    no-op instead of raising KeyError (as set.remove would).
    """
    _disabled_names.discard(fn_name)
def reset():
    """ Resets the current timer. Call this at the start of an iteration. """
    global _running_timer
    # Wipe all accumulated timing state in one pass.
    for store in (_total_times, _start_times, _timer_stack):
        store.clear()
    _running_timer = None
def start(fn_name, use_stack=True):
    """
    Start timing the specific function.
    Note: If use_stack is True, only one timer can be active at a time.
    Once you stop this timer, the previous one will start again.
    """
    global _running_timer, _disable_all
    if _disable_all:
        return
    if use_stack:
        if _running_timer is not None:
            # Pause the active timer (bank its elapsed time) and remember it
            # on the stack so stop() can resume it later.
            stop(_running_timer, use_stack=False)
            _timer_stack.append(_running_timer)
        start(fn_name, use_stack=False)
        _running_timer = fn_name
    else:
        # Raw mode: just record this name's start timestamp.
        _start_times[fn_name] = time.perf_counter()
def stop(fn_name=None, use_stack=True):
    """
    If use_stack is True, this will stop the currently running timer and restore
    the previous timer on the stack if that exists. Note if use_stack is True,
    fn_name will be ignored.
    If use_stack is False, this will just stop timing the timer fn_name.
    """
    global _running_timer, _disable_all
    if _disable_all:
        return
    if use_stack:
        if _running_timer is not None:
            # Bank the running timer's elapsed time...
            stop(_running_timer, use_stack=False)
            if len(_timer_stack) > 0:
                # ...and resume whichever timer it had paused.
                _running_timer = _timer_stack.pop()
                start(_running_timer, use_stack=False)
            else:
                _running_timer = None
        else:
            print('Warning: timer stopped with no timer running!')
    else:
        # Raw mode: accumulate elapsed time for this name if it was started.
        if _start_times[fn_name] > -1:
            _total_times[fn_name] += time.perf_counter() - _start_times[fn_name]
        else:
            print('Warning: timer for %s stopped before starting!' % fn_name)
def print_stats():
    """ Prints the current timing information into a table. """
    print()
    names = [name for name in _total_times.keys() if name not in _disabled_names]
    # The name column is at least 4 wide and padded up to an even width.
    name_width = max([len(name) for name in names] + [4])
    if name_width % 2 == 1:
        name_width += 1
    row_fmt = ' {:>%d} | {:>10.4f} ' % name_width
    header = (' {:^%d} | {:^10} ' % name_width).format('Name', 'Time (ms)')
    print(header)
    # Separator with a '+' aligned under the header's column divider.
    bar_pos = header.find('|')
    separator = ('-' * bar_pos) + '+' + '-' * (len(header) - bar_pos - 1)
    print(separator)
    for name in names:
        print(row_fmt.format(name, _total_times[name] * 1000))
    print(separator)
    print(row_fmt.format('Total', total_time() * 1000))
    print()
def total_time():
    """ Returns the total amount accumulated across all functions in seconds. """
    enabled = (elapsed for name, elapsed in _total_times.items()
               if name not in _disabled_names)
    return sum(enabled)
class env():
    """
    Context manager that wraps timer start/stop around a code block:

        with timer.env(fn_name):
            # (...)

    start() is called on entry and stop() on exit, even if the block raises.
    """
    def __init__(self, fn_name, use_stack=True):
        self.fn_name = fn_name
        self.use_stack = use_stack

    def __enter__(self):
        start(self.fn_name, use_stack=self.use_stack)

    def __exit__(self, exc_type, exc_value, traceback):
        stop(self.fn_name, use_stack=self.use_stack)
================================================
FILE: web/css/index.css
================================================
/*
Pallete:
FFFFFF
D2CBCB
7D8491
003459
274C77
161925
*/
/* Border-box sizing everywhere so padding doesn't grow element widths. */
* { box-sizing: border-box; }

/* Oversized page-title text. */
.big {
    font-size:72px;
    margin-bottom: 20px;
}

/* Container for the list of detection files on the index page. */
.list_wrapper {
    width: 500px;
    padding-top: 2px;
    padding-bottom: 20px;
}

/* Full-page 3-row grid: fixed header, flexible main area, thin footer. */
body {
    margin:0;
    padding:0;
    vertical-align: top;
    background-color: #274C77;
    color: #ffffff;
    font-family: 'Open Sans', sans-serif;
    font-size: 24px;
    width: 100%;
    height: 99vh;
    display: grid;
    grid-template-areas:
        'header'
        'main'
        'footer';
    grid-template-rows: 100px auto 25px;
    text-align: center;
}

/* Rounded darker panel used for content boxes. */
.box {
    background-color: #23395B;
    border-radius: 10px;
}

/* Assign elements to the body grid's named areas. */
.header { grid-area: header; }
.main { grid-area: main; }
.footer { grid-area: footer; }

span {
    margin:0;
    padding:0;
    vertical-align: top;
}
================================================
FILE: web/css/list.css
================================================
/* Strip default list chrome (bullets, indent) from the file list. */
ul {
    list-style-type: none;
    margin: 0;
    padding: 0;
}

li {
    /* font: 200 24px/1.5 Helvetica, Verdana, sans-serif; */
    font-size: 22px;
}

/* Full-width block links with an animated hover transition
   (vendor prefixes kept for older browsers). */
li a {
    text-decoration: none;
    color: #fff;
    display: block;
    width: 100%;
    -webkit-transition: font-size 0.2s ease, background-color 0.2s ease;
    -moz-transition: font-size 0.2s ease, background-color 0.2s ease;
    -o-transition: font-size 0.2s ease, background-color 0.2s ease;
    -ms-transition: font-size 0.2s ease, background-color 0.2s ease;
    transition: font-size 0.2s ease, background-color 0.2s ease;
}

/* Grow and highlight the hovered entry. */
li a:hover {
    font-size: 30px;
    background: rgb(95, 138, 219);
}
================================================
FILE: web/css/toggle.css
================================================
/* Container for the toggle-switch control. */
.switch {
    position: relative;
    /* Was `top: 5;` — a non-zero length without a unit is invalid CSS and
       the declaration is dropped; `5px` applies the intended nudge. */
    top: 5px;
}
/* Hide the real checkbox; the styled .slider is its visual stand-in. */
.switch input {display:none;}
/* Gray track of the switch. */
.slider {
    position: relative;
    display: inline-block;
    width: 60px;
    height: 26px;
    cursor: pointer;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background-color: #ccc;
    -webkit-transition: .4s;
    transition: .4s;
}

/* White knob, drawn as a pseudo-element on top of the track. */
.slider:before {
    position: absolute;
    content: "";
    height: 20px;
    width: 20px;
    left: 3px;
    bottom: 3px;
    background-color: white;
    -webkit-transition: .4s;
    transition: .4s;
}

/* Checked state: blue track... */
input:checked + .slider {
    background-color: #2196F3;
}

input:focus + .slider {
    box-shadow: 0 0 1px #2196F3;
}

/* ...with the knob slid to the right. */
input:checked + .slider:before {
    -webkit-transform: translateX(34px);
    -ms-transform: translateX(34px);
    transform: translateX(34px);
}

/* Rounded sliders */
.slider.round {
    border-radius: 34px;
}

.slider.round:before {
    border-radius: 50%;
}
================================================
FILE: web/css/viewer.css
================================================
/* Map the three viewer panels onto the #viewer grid's named areas. */
.info { grid-area: info; }
.image { grid-area: image; }
.controls { grid-area: controls; }

/* Three-column layout: info | image (double width) | controls. */
#viewer {
    display: grid;
    grid-template-areas: 'info image controls';
    grid-template-columns: 1fr 2fr 1fr;
    grid-gap: 0;
}

#viewer > div.box {
    padding: 10px;
    margin: 0 10px 10px 10px;
}

/* Title row sized to its content; the image fills the remaining space. */
.image_box {
    display: grid;
    grid-template-rows: max-content auto;
    grid-gap: 10px;
}

/* Muted gray for value/readout text. */
#image_idx, #config_name, .info_value {
    color: rgb(152, 160, 175);
}

.info_section {
    text-align: center;
    border-bottom: 1px solid #fff;
}

a {
    text-decoration: none;
    color: #fff;
}

a:hover {
    color: rgb(152, 160, 175);
}

/* One settings row: label | current value | input widget. */
.setting {
    display: grid;
    grid-template-areas: 'label value input';
    grid-template-columns: max-content 30px 1fr;
    grid-gap: 20px;
    padding: 0 10px 0 10px;
    text-align: left;
}

.setting_label { grid-area: label; }

.setting_input {
    grid-area: input;
}

.setting_value {
    grid-area: value;
    color: rgb(152, 160, 175);
}

.box_title {
    width: 100%;
    border-bottom: 1px solid #fff;
}
================================================
FILE: web/dets/ssd300.json
================================================
{"info": {"Cross-Class NMS": {"BBox mAP": 22.6,"Mask mAP": 13.5},"Per-Class NMS": {"BBox mAP": 23.9,"Mask mAP": 14.1},"Config": {"preserve_aspect_ratio": false, "use_prediction_module": false, "use_yolo_regressors": false, "use_prediction_matching": false}}, "images": [{"image_id": 74, "dets": [{"score": 0.9325719475746155, "bbox": [56.2, 277.7, 308.8, 103.1], "category": "dog", "mask": {"size": [426, 640], "counts": "Y[P18P=6K4K3N2N2N2N2M2O2N2O0O2N1O2O0O2O0O101N10000O2O0O101N10000O2O000O2O0O10000O100O10001N10000O1000000O101O000O1000000O2O00000O100000000O2O000000000O10001O00000O100000001O000O1000000000000O2O0000000000000O101O000000000000000O101O000000000000000000000000001N100000000000001O000000000000001O0000000000000000000000000000000000001O00000000000000001O0000000001O0000000000000000000O1010O000000000000000001O0000000000001O00001O00001N1000001O0000001O00001O00001O001O00001O00001O001O001O0O2O1O001O1O0O2O1O001O1O0O2O001N2O2N1N2O1N2O1N2N2O1N3M2O1N3M3L8G6HjTi3"}}, {"score": 0.9244362115859985, "bbox": [0.0, 14.7, 156.8, 309.7], "category": "bicycle", "mask": {"size": [426, 640], "counts": "Y2o5[700000O2M4M3M3M3L4M3M3M3L4M3M3L3N2M3N2O000O1000001O0O100O100O100O10000000000000000001N101N101N100O2N2O0O2O1N1O2O1N1O2O1N3M2N2O1N2N2N2N2O2M2O1O2M2O1O1N2O2N1O1O1N3N1O1O1O1O1N2O1O001O1O00100O1O1O1O11N2O001N2O001O00N2O0O2O1N2O0O2O1O2O0O2O1N1O2O0N3M3M2O2M4L3M4L4K4L5K5J6J5K6I7J9D?@f0UOgj_6"}}, {"score": 0.46240681409835815, "bbox": [460.5, 104.3, 33.6, 46.3], "category": "person", "mask": {"size": [426, 640], "counts": "[WS6b0e<6K3M2O2M200O1O100001O0O2O1N2N3L4I9CPTo1"}}, {"score": 0.34754881262779236, "bbox": [283.5, 99.3, 21.8, 54.8], "category": "person", "mask": {"size": [426, 640], "counts": "cmg3>i<9H6J5K3N2O01N2N3N2K5Hi<9H4M2O02M2N4Imhn3"}}, {"score": 0.21550069749355316, "bbox": [274.8, 100.5, 19.4, 51.0], "category": "person", "mask": {"size": [426, 640], "counts": "Scd3C9H6L3L3N2O001O0000O2O2N2M3M4K6H9F`0WOUQn4"}}, {"score": 0.08580822497606277, "bbox": [4.3, 141.6, 118.5, 
117.7], "category": "bicycle", "mask": {"size": [426, 640], "counts": "_j1Y2Q;00000O101O00000O2O000O101N1000000001O00O010000O100O100O1O1O1O1O1O100O1O101N1O2O0O1O2O0O101N101O001N1O2N1O2O0O2O0O1N3N1O1O2N101O0O101O001O01O1O10O00101N101O000O1N2O1O0O101O10O02O000O3L3M2N1O1O2Nkco6"}}, {"score": 0.08575142174959183, "bbox": [344.1, 91.5, 22.0, 37.9], "category": "person", "mask": {"size": [426, 640], "counts": "PXZ8"}}, {"score": 0.082521952688694, "bbox": [301.3, 94.7, 17.9, 37.6], "category": "person", "mask": {"size": [426, 640], "counts": "PXZ8"}}, {"score": 0.08142534643411636, "bbox": [284.8, 95.2, 19.2, 28.7], "category": "person", "mask": {"size": [426, 640], "counts": "PXZ8"}}, {"score": 0.07999005168676376, "bbox": [316.0, 123.6, 20.4, 30.7], "category": "person", "mask": {"size": [426, 640], "counts": "PXZ8"}}, {"score": 0.07951077073812485, "bbox": [348.1, 120.8, 19.9, 31.0], "category": "person", "mask": {"size": [426, 640], "counts": "PXZ8"}}, {"score": 0.07927089929580688, "bbox": [245.6, 117.6, 23.5, 25.0], "category": "car", "mask": {"size": [426, 640], "counts": "PXZ8"}}, {"score": 0.07882924377918243, "bbox": [374.3, 99.4, 15.9, 41.2], "category": "person", "mask": {"size": [426, 640], "counts": "Pim4C=C6K3N2M3O0O10O2O1O1N2N4J5H9C?CQkP3"}}, {"score": 0.22430764138698578, "bbox": [256.6, 382.7, 42.6, 103.0], "category": "fire hydrant", "mask": {"size": [500, 333], "counts": "[jP4m0b>d0]O>C7I5L3M4L3N1O2N1O1O2O0000000001O1O1N2O1N3N2M4L3L6J5J:]Og0lNidb0"}}, {"score": 0.1781381219625473, "bbox": [23.6, 352.0, 28.9, 47.1], "category": "bench", "mask": {"size": [500, 333], "counts": "Za=9X?8I5L3M3M2O0O2O001O000000000001N1O2N2M3M5JWZZ4"}}, {"score": 0.1202094629406929, "bbox": [122.1, 317.6, 19.4, 58.2], "category": "person", "mask": {"size": [500, 333], "counts": "Vkm1`0P?=D7J4M1O11N2N2M5I7F[\\o2"}}, {"score": 0.1143656000494957, "bbox": [185.8, 253.1, 64.0, 135.0], "category": "potted plant", "mask": {"size": [500, 333], "counts": 
"i_R31`?5H9I7K5K3L3N2M4M4J5M300O001N1O2N10000001O1O2N1O2N1O2N1O2M3N1N3N1N4M2M2N4Lka]1"}}, {"score": 0.10406192392110825, "bbox": [154.1, 229.7, 66.2, 110.9], "category": "potted plant", "mask": {"size": [500, 333], "counts": "jeg23`?2N3EKRA7i>MXA3d>0\\A0c>1^AOc>0[A0g>0VA2R?43N3M000O2N10OOccR2"}}, {"score": 0.10101788491010666, "bbox": [134.0, 320.5, 8.1, 23.2], "category": "person", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.10056286305189133, "bbox": [74.1, 320.6, 49.0, 68.0], "category": "person", "mask": {"size": [500, 333], "counts": "`[Y1?P?9I5L3M2N3N1O1O2N1O1O1O10O0100000O01000O2O0O2O0N3N2M3M3L5K7IfSZ3"}}, {"score": 0.09888952225446701, "bbox": [135.5, 323.7, 8.3, 23.8], "category": "person", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.09524021297693253, "bbox": [130.0, 319.1, 11.2, 32.8], "category": "person", "mask": {"size": [500, 333], "counts": "fWR25Y[P3"}}, {"score": 0.09091795980930328, "bbox": [165.4, 326.7, 10.6, 9.5], "category": "car", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.09052538126707077, "bbox": [111.6, 235.2, 75.5, 95.7], "category": "potted plant", "mask": {"size": [500, 333], "counts": "`WP25T?=M3N2M1O2M2N2O2O00000001O0000000000O20N2O1N100O2K4O2N3N5IUgc2"}}, {"score": 0.08909637480974197, "bbox": [244.2, 83.4, 44.1, 82.9], "category": "potted plant", "mask": {"size": [500, 333], "counts": "oUj38R?f0@9H7J6J3N3L3O1N2N2N100O2O000000000001O001O0O3M2O1N3M2M4L5J6I9EmYh0"}}, {"score": 0.08896711468696594, "bbox": [169.5, 325.2, 13.4, 12.9], "category": "car", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.0880962461233139, "bbox": [86.0, 333.2, 26.6, 45.1], "category": "person", "mask": {"size": [500, 333], "counts": "Th]19V?9J3M3N1O100O02O0O2M3M4IX`^3"}}, {"score": 0.08524665981531143, "bbox": [129.0, 207.1, 73.7, 93.2], "category": "potted plant", "mask": {"size": [500, 333], "counts": 
"mPX24\\?5H7O2M2O2N1N2N3O1N3M1O2O1N100010O0O2O0000000000010O010N3L3M3O3M3N01O10FSAJT?Mn@2S?Ko@5ioW2"}}, {"score": 0.0836317166686058, "bbox": [101.3, 311.4, 55.1, 95.1], "category": "person", "mask": {"size": [500, 333], "counts": "`mj17Y?9J6I7J2M6K4L2N100O11M2N2N2O1O2M2O2L4M4M3K5L7Eink2"}}, {"score": 0.08311937749385834, "bbox": [201.6, 219.4, 66.3, 133.3], "category": "potted plant", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.08186047524213791, "bbox": [178.6, 204.4, 46.5, 175.9], "category": "potted plant", "mask": {"size": [500, 333], "counts": "j`o29n>k0gNZOaBo0Y=k0L2O111M11N2L8I6K3N6Gd0\\O5J3L:EnXj1"}}, {"score": 0.08172310143709183, "bbox": [291.1, 335.1, 41.9, 87.9], "category": "potted plant", "mask": {"size": [500, 333], "counts": "dl`4a0k>b0A9I5L4K4N3L3N2N101N1O101O000O100001O0O10001N101N2N2M3M4L4K7I8D]c1"}}, {"score": 0.07987207919359207, "bbox": [169.9, 235.3, 90.8, 84.1], "category": "potted plant", "mask": {"size": [500, 333], "counts": "[XY32a?10001O0001O0Oj]e1"}}, {"score": 0.07970231026411057, "bbox": [78.1, 236.9, 73.8, 97.4], "category": "potted plant", "mask": {"size": [500, 333], "counts": "g_e14X?9M2O4N00O1O2N1N2Gm@OT?On@0fVX3"}}, {"score": 0.07964340597391129, "bbox": [135.4, 317.4, 7.2, 15.5], "category": "person", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.07952380925416946, "bbox": [119.8, 321.3, 14.9, 32.2], "category": "person", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.07941421121358871, "bbox": [161.8, 323.7, 10.3, 10.0], "category": "car", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.07934436202049255, "bbox": [90.1, 298.4, 112.6, 104.9], "category": "person", "mask": {"size": [500, 333], "counts": "TgR23_?3O0O01M3O100OTok2"}}, {"score": 0.07828323543071747, "bbox": [140.4, 323.8, 7.2, 16.0], "category": "person", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.07696278393268585, "bbox": [75.9, 337.4, 29.8, 44.1], "category": "person", "mask": 
{"size": [500, 333], "counts": "nkX1>S?5L3N1O100O10000000O2O0O2M5JP]b3"}}, {"score": 0.07683099806308746, "bbox": [133.8, 329.5, 8.1, 20.3], "category": "person", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.07637376338243484, "bbox": [159.8, 323.1, 7.8, 9.2], "category": "car", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.07564042508602142, "bbox": [96.6, 207.0, 75.0, 86.5], "category": "potted plant", "mask": {"size": [500, 333], "counts": "Rlh14^?;F6K4L2N2N2O1O002N10O1O002M2N3M3L3M5K5L6I6J8GnXS3"}}, {"score": 0.07276112586259842, "bbox": [136.0, 264.2, 103.5, 124.0], "category": "potted plant", "mask": {"size": [500, 333], "counts": "mmT34W?;M2O1O1O10000O010O01O1O010O1O2M2N3L3Ijlc1"}}, {"score": 0.07155277580022812, "bbox": [173.7, 327.2, 9.3, 9.7], "category": "car", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.07146043330430984, "bbox": [164.4, 272.9, 48.4, 115.1], "category": "potted plant", "mask": {"size": [500, 333], "counts": "`Uj25o>6SANj>d0M2OO0O2N4K3L3M5L231M0L3KhRQ2"}}, {"score": 0.07007556408643723, "bbox": [136.7, 320.0, 7.7, 14.6], "category": "person", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.06949877738952637, "bbox": [172.9, 326.0, 14.1, 15.2], "category": "car", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.06893869489431381, "bbox": [142.9, 319.3, 7.0, 13.7], "category": "person", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.0686916932463646, "bbox": [138.9, 142.0, 23.2, 57.2], "category": "traffic light", "mask": {"size": [500, 333], "counts": "m^V2j0g>8I4M2N2O0001O1O1N2N4K8DSie2"}}, {"score": 0.06749465316534042, "bbox": [123.4, 320.4, 11.7, 22.0], "category": "person", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.06610989570617676, "bbox": [168.6, 328.9, 11.1, 12.1], "category": "car", "mask": {"size": [500, 333], "counts": "TcR5"}}, {"score": 0.06579636037349701, "bbox": [192.2, 188.6, 54.4, 146.4], "category": "potted plant", 
"mask": {"size": [500, 333], "counts": "Re[34a?Njne1"}}]}, {"image_id": 7125, "dets": [{"score": 0.9481565356254578, "bbox": [22.0, 324.1, 107.1, 136.3], "category": "bicycle", "mask": {"size": [471, 640], "counts": "al;c0o=b0@8H7J3K6K4M4L3M3M2O1O1O1N2O2N1O1O1N2O100O1O10O01O100O010O010O01O001O00100O1O1O1O100O1O001O1O2N1O1O1O1O100O100O1N2N3M2O1O1N2O2N1O2O0N3N2M3N3M2O1N2N3K5K8I6J5L8CmWe7"}}, {"score": 0.9457718729972839, "bbox": [12.1, 230.4, 112.5, 173.5], "category": "person", "mask": {"size": [471, 640], "counts": "Wod04]>:J5J5K4L4M4L3M3M3M5K4L4K5K5L4L5L3N2M3M2O2N1N2O1O0O2O1O1O001N200O1O100O2N100O100O2O001O0100000O01O100O1O10OO1O2O0O2O0O1M4L3L5I6L5K5K6I7H8J6J6I:E`0ZOR[_7"}}, {"score": 0.4501323699951172, "bbox": [311.6, 230.7, 23.6, 64.4], "category": "person", "mask": {"size": [471, 640], "counts": "cQb4:Z>:G7I5K4M2O0001N3M2L5J7HX[^4"}}, {"score": 0.38191086053848267, "bbox": [45.0, 387.0, 146.9, 84.0], "category": "bicycle", "mask": {"size": [471, 640], "counts": "lih0:L5K4M2N1O2N2N1O2N100O2O0O100O1O10000O10001O0O100000O0100O100O1000O100000000O100000000O1000000O10O10O100O100O10000O00100O100000O100000000O10O10000O010O100001O000O1000000O100O1000001N10000000000000000000000001O000O10000O10000O2O0O2N2N2N102M2N2N3L4L4L4M4FhAMoTc6"}}, {"score": 0.21056951582431793, "bbox": [300.6, 229.4, 18.4, 55.0], "category": "person", "mask": {"size": [471, 640], "counts": "Y^]4:Z>9I3N00O3L4Kife4"}}, {"score": 0.18508988618850708, "bbox": [291.3, 231.0, 16.9, 48.9], "category": "person", "mask": {"size": [471, 640], "counts": "ahY4;[>11N[Wk4"}}, {"score": 0.17722219228744507, "bbox": [41.2, 335.2, 68.9, 86.7], "category": "bicycle", "mask": {"size": [471, 640], "counts": "aei06Z>>B:I6K4M3M3M3M3N1O100O2O0O10000O10000001O000000001N101O001N101O1N2N2O3K5L4K6J6I\\ki7"}}, {"score": 0.14345361292362213, "bbox": [264.3, 231.7, 16.3, 45.6], "category": "person", "mask": {"size": [471, 640], "counts": "h]l35_>8J3OLbSX5"}}, {"score": 0.12510931491851807, "bbox": [286.9, 234.2, 14.3, 39.4], 
"category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.11366352438926697, "bbox": [381.8, 241.3, 20.2, 29.9], "category": "person", "mask": {"size": [471, 640], "counts": "hfb59\\>3N2O00000001N3Kl`_3"}}, {"score": 0.10501698404550552, "bbox": [298.8, 234.6, 15.2, 39.0], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.09638545662164688, "bbox": [266.1, 237.4, 11.5, 24.2], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.09364248812198639, "bbox": [18.9, 328.3, 69.4, 87.4], "category": "bicycle", "mask": {"size": [471, 640], "counts": "nm`02]>;F8K5L4M2N3M2N3N1O001O10000O100O1O100O10000000000000000001O1N102M3M3L5I8I;Ag`S8"}}, {"score": 0.09298816323280334, "bbox": [307.8, 234.3, 17.6, 38.7], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.09066575765609741, "bbox": [273.5, 235.1, 10.1, 18.0], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.0882878303527832, "bbox": [269.4, 236.9, 12.1, 24.3], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.08822048455476761, "bbox": [290.9, 235.9, 10.7, 20.4], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.08802320808172226, "bbox": [256.2, 239.2, 11.8, 28.2], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.08618372678756714, "bbox": [272.6, 240.5, 11.8, 24.9], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.08602041006088257, "bbox": [285.0, 234.7, 10.5, 18.6], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.0853605642914772, "bbox": [323.6, 241.5, 17.2, 45.3], "category": "person", "mask": {"size": [471, 640], "counts": "^bg43a>8K11Kjn\\4"}}, {"score": 0.08435170352458954, "bbox": [0.0, 234.9, 325.3, 161.1], "category": "person", "mask": {"size": [471, 640], "counts": 
"WYX11d>3M3N2N2M201N2O0O2N2O1N101N2O1N2O2M2O0000001O0O10000000000000000000001O01O1O1O1O010O001O01O01O0000010O000O2O2N1O1N101O0O100O2DPBOP>OUBNk=1XBLi=3>01MX[n6"}}, {"score": 0.08298880606889725, "bbox": [287.9, 237.1, 11.0, 23.0], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.08224418014287949, "bbox": [265.0, 235.9, 9.6, 16.7], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.08193187415599823, "bbox": [249.3, 239.2, 10.0, 18.7], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.08116711676120758, "bbox": [305.1, 234.2, 11.0, 20.0], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.08103228360414505, "bbox": [314.3, 234.9, 8.3, 13.4], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.08089426904916763, "bbox": [313.9, 234.4, 17.6, 39.1], "category": "person", "mask": {"size": [471, 640], "counts": "V[d43gPb4"}}, {"score": 0.08075782656669617, "bbox": [282.8, 241.2, 11.8, 23.8], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.07997600734233856, "bbox": [220.0, 224.9, 186.8, 122.5], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.0797334760427475, "bbox": [268.4, 245.4, 14.8, 32.0], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.07843126356601715, "bbox": [290.5, 235.2, 8.7, 14.0], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.07827509939670563, "bbox": [317.3, 234.3, 11.6, 20.0], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.07773595303297043, "bbox": [0.0, 322.2, 64.4, 148.8], "category": "bicycle", "mask": {"size": [471, 640], "counts": "QZ1n0c=7K5K3N2N2N100O2O00000001O0O102M2N4J8G]ba7"}}, {"score": 0.07487154752016068, "bbox": [298.1, 234.8, 10.1, 18.1], "category": "person", "mask": {"size": [471, 640], 
"counts": "P\\V9"}}, {"score": 0.07476241141557693, "bbox": [263.6, 246.2, 12.7, 28.6], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.0746501162648201, "bbox": [374.0, 244.0, 15.5, 22.1], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.07400096952915192, "bbox": [383.0, 243.3, 14.3, 19.9], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.0736982673406601, "bbox": [258.0, 218.9, 126.9, 90.5], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.0735945925116539, "bbox": [55.5, 338.0, 137.7, 113.3], "category": "bicycle", "mask": {"size": [471, 640], "counts": "YgT15a>3ELnA6o=;F^O\\Bd0a=@\\Ba0a=C^B>_=>O101N101O1O2N0000O10001N100O2N2N1O3M2N2O1N101O007H2O1N1N2O1N2O1O101N2M3M^]]7"}}, {"score": 0.07285694777965546, "bbox": [362.5, 243.1, 13.6, 27.0], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.07259279489517212, "bbox": [319.5, 233.4, 8.7, 11.9], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.07240854203701019, "bbox": [245.1, 239.6, 10.5, 20.8], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}, {"score": 0.07214810699224472, "bbox": [290.0, 248.0, 14.1, 31.2], "category": "person", "mask": {"size": [471, 640], "counts": "P\\V9"}}]}, {"image_id": 13882, "dets": [{"score": 0.8088568449020386, "bbox": [49.3, 76.4, 320.5, 384.4], "category": "cat", "mask": {"size": [640, 426], "counts": "\\n]15fc07H8H9H7K5J6K6K4L3M4L3L5L4K5L3L5L2N2O1N3M2O1N2N3M2O1N2N3M2M3N2N2N2M2N3N2N1O2O1N101N2O0O2O1O0O2O1O1O1O0O2O1O1O1O1N2O1O1O1O1O1N2O001O1N1O2N1O2O1N1O2N1O2N101N1O2M2O2N2N2N2N2N2O0O2O1O1O1O1O001O1O1O1O100O1O1O2O0O1O1O1O1O1N2N2O1N2O1N101O10O01O010O010O10O10O01000O1O2O0O101N100O2O0O2O0O2O0O10000000O1O1O1O1O1O1O1O101N1O100O100O101N101N101N2O0O2O1N2O1N2O1N2O1N2O0O2N2N2N2N2N2N2M3N2M4L3M4L5K6L4K4L5L:Ee0[Oe0[O=D4K6J5J7J<`NS^O6Pc0J7J4KlZj2"}}, {"score": 
0.8073158860206604, "bbox": [0.0, 95.0, 102.2, 219.7], "category": "toilet", "mask": {"size": [640, 426], "counts": "Y3f5Z>000O2N2N2N2I601O1O1O1O1O1O1O001O1O001O1O001O001O00001O000000000000000000O1000000O10000O100000000O1O1O1O1O1O1O1O1O100O7H2O1O2N1N2N3M2M3N3M2M4M3L3N3M2N3M3M2M4M3N2M4L3M3M4L3N3L4L5J5L5J6I9G8G;D<]Ob_]6"}}, {"score": 0.4196690618991852, "bbox": [204.1, 6.7, 198.1, 341.1], "category": "laptop", "mask": {"size": [640, 426], "counts": "[PT5;ac0`:TOTFe0m9ROdFe0]9TOQGd0P9VO[Gc0f8XOeGb0\\8[OnG=S8AWH5k7J^HLd73cHE_79gH_O]7`0iHSO_7l0gHiN_7V1hHXNd7f1\\5O001O2M2N2N3M2O1O2N1O2N1O2M3M2M4K6K5J7J5H9H9Gffg0"}}, {"score": 0.12315364927053452, "bbox": [277.2, 165.1, 85.4, 151.3], "category": "keyboard", "mask": {"size": [640, 426], "counts": "d_e5h0kb0g0@:G9F9H7F9L4N3L3N2O1N2N2N1O2O1N2O1O001O000000000001O001O1O1O1O001O1O1N3N2N2N2M3N2N2M4M3L4M3L3M3N3L5L3M2L5K5K8I8Gh0SO_k_1"}}, {"score": 0.11757747828960419, "bbox": [0.0, 11.2, 115.6, 209.0], "category": "toilet", "mask": {"size": [640, 426], "counts": "d0\\5d>0000001O0O10001O000O2O0O1010O101N101N2O1N1O2O1N2N8I>A;F3L5K3N1N2N3M2N2N100N3N1N2O1O1O2M3N2N2N3L2O2O0O10001O0N2N2N2O0O2O00O1000O1000O100O01000O1O1O100O1000O001O1O1O0O2M3N2N2N2M4M3M3M3N2N2M3N3K5L3K6K4]OR]OOob[6"}}, {"score": 0.08908180147409439, "bbox": [123.0, 79.4, 58.1, 82.2], "category": "handbag", "mask": {"size": [640, 426], "counts": "][a2i0Sc08H7K4L4M3M2N2N2N2O1N101N101O0O10001O01O0000001O00001O1O1O0O2O1O1O1N2O1N3N1N3M2N3M4K5K6I>\\OXTl4"}}, {"score": 0.08811136335134506, "bbox": [9.5, 435.1, 342.4, 195.9], "category": "keyboard", "mask": {"size": [640, 426], "counts": 
"iX>2b0Ohb03V]OOgb04V]OOgb05V]OMgb06W]OLXb03a]O44LUb0h0i]OZOTb0[1N2U^O`NSa0b1g^OeNUa0V2M3M3M2N3M5K5K2O1N2N2O0O101N100O2O0O10000O2N100O101N100O2O0O100O101N100O1000O001O001O001O1O10O0100O100O010O100000001O00000000000000001O000O1000000O100O100O1O1O1O001O1N2O1O1O1O1O101N100O2M2N3M2N3M3N1O1O2O0O101N1000000000000000O1000000000000000000000O2O0O10001N100O2O000O2M2O2N1N3N10001O0001O01O0000001O00000000001O000O100O2N1O1N2N3L3M3O1O2N1O1O101N100O10000O2O0O100O1O2O0O1O2O0O101N2N1Cn\\OLTc0OR]ONob0OT]O0mb0MW]O0lb0LV]O4\\c0O0O2O0000001O`Wf3"}}, {"score": 0.08581019937992096, "bbox": [20.6, 35.1, 187.6, 290.9], "category": "toilet", "mask": {"size": [640, 426], "counts": "nb<_5a>000000001O1N2O1O1O1N101O001000O10O10O010O001000O010O10000O1000010O0001O00010O0000O10O10O10O100O100O100O1O100O1O1O100O2O0O1O001O1O1O1O100O1OO1000000000000000000O100000O10000000O010O001O001O00001O000000O10O10O100O1O01O1O2N2N1O2N1O1O0OO3K5O0001O000O1O1O02N3M3L3N3L3M2O1OO21N4M2N2N3N1N2I8D;M3N3_Mi_O5?ROl?f0j_ONh0SOb?k0k_OIaa03g^OZOha0b0n0M3L=CR]m4"}}, {"score": 0.08571020513772964, "bbox": [197.4, 408.3, 166.8, 88.7], "category": "keyboard", "mask": {"size": [640, 426], "counts": "ZRS42mc02N2L4L5M3M3N1N2O1N101N2N2N2N2N101N100O101N10000O2O0O10000O1000000O100O100O101O000O100000000O10000O1000000000000000000O2O000O10000000000O10000000000000000O10000000000000O1000O1000000O1000000000000O100000000001O00000000000000O10000000O10001O0000000O1000001O0O101O0O10001N101O0O101N2N1O3N1N2N4L3M3L5K4K30000000SZV1"}}, {"score": 0.08055723458528519, "bbox": [359.6, 197.4, 32.8, 135.4], "category": "book", "mask": {"size": [640, 426], "counts": "\\cU7Y1\\b0m0[O8H7K3M3M2O1N2O1O1OO1O2O1N2M4K7G:Dm0dNkTh0"}}, {"score": 0.07827956974506378, "bbox": [0.0, 110.5, 85.4, 83.5], "category": "sink", "mask": {"size": [640, 426], "counts": "^X3c0Wc0;G9I4L4L4M3M2O1N2N101N2N101O0O101O0O101O00000O101O0000000000000000000000000000000000000000000000O2O0000000O2O000O2O000O2O0O2O0O2N2N101N2N3L3N3L4L5J8Ei_g6"}}, {"score": 0.07534623146057129, "bbox": 
[13.1, 362.6, 244.7, 220.0], "category": "keyboard", "mask": {"size": [640, 426], "counts": "Sa8i3W`000000000000O2N2N3M1O2O1N2N1O2N1O2O0000001N100000001O0000001OO10O1000O1000O1000O1000000O10000O100O100O1O1O10000O01000O010O10O0100O010O2O0O101N100O101N10000O2OO1000O100O100O100O10O01O2O000O10000O2O000O100000000O10000O1000O0100O10000O100O1O100O10000O100O10000O100O010O100O1O10O01O10000O100000000O100000000O100O1O1O101N1O100O100O100O10000O01000O10000O10000O101N101N1O2O0O2N100O2N100O2N100O2N101N1O3M2M3L5L4L4L4M3M2N3L4M3L9G[jm3"}}, {"score": 0.07523778825998306, "bbox": [98.7, 18.9, 273.1, 241.8], "category": "chair", "mask": {"size": [640, 426], "counts": "^Rj27Vc0d0M3N1N3N1N3N2N2N2M4N2M2N3M2N2O0O100O2O0O100O2O0O2O1O1N3N1N101N101N1O2O0O2O1N2N2O1N2N2N2O1N1O2I601O00001O00001O000000001O0000000000001O00000000001O0000000000001O0000000000O10000000000O10000000000O10000000000O1000000000000O100O100O1O100K5J6N2O1O7I1O2N100O2N1O2N1O2O1N1O2M3N1O2N2M3N2M2N3M3N2N2N2M3N2N2O1N2N1L5L4K5DkR^2"}}, {"score": 0.06777101755142212, "bbox": [69.4, 62.8, 65.0, 169.8], "category": "toilet", "mask": {"size": [640, 426], "counts": "Xk`1h0Rc0P1nN`0F4J9H9H7J5K5K5L3L5L3N3M2M3O0O2N2O0O2N101N2O001N101O001N101O001O1O00001O001O001O00001O01O01O001O001O001O001O001O010O0010O01O010O00100O010O00100O0010O0100O0010O01O01O01O010O00100O0010O01O10O010O01O001O010O1O001O10O01O0010O01O010O010O0010O01O0010O0001O0010O01O010O00010O00010O0001O01O01O001O00010O001O001O001O001O1O001N101O1O1N101O1N2O001N2N2O1N2N3M2N2O2L3N3M2M4L4J7I8G;Bd0^OZXS1"}}, {"score": 0.8097001910209656, "bbox": [0.0, 68.4, 122.6, 177.0], "category": "toilet", "mask": {"size": [486, 640], "counts": "Q3b2d<000N4K7J5K5L3L4M3M3N2M3M3M3N1N2O1N3N1O1N3N1O1O2N1N2O2N1O1O1O1N101O1O1O1O1O001O1O000O2O001O01O01O00001O000000000001O01O00005K01O000000001O0000001O001O001O001O1O1J501O001N2O1O001N3N1O1N3N1O1N3N1N3N1N3N1N3M3M3M3M2N3M4K4L5J6K5K5J7I9E=B>@Tcg7"}}, {"score": 0.21771341562271118, "bbox": [486.4, 340.3, 89.1, 80.6], "category": "dog", "mask": {"size": [486, 640], 
"counts": "ZQ`7a0b>9G7I4N3M2N1O2N100O2O0O100O101O000O100000000O01000000O1000000O010O10000O100O100O1O100O1O1N2O2N1N2O1N3M2N3M3M3K5KbeT1"}}, {"score": 0.1472984403371811, "bbox": [482.4, 299.0, 95.2, 89.5], "category": "cat", "mask": {"size": [486, 640], "counts": "RZZ7d0^>8I6I6K5L3N2N2M2O2N2N101N1O101N100O10000O2O000O100000000O10001O000O10000000000000000O1000O100000000000000O101O00000O10000O2O0O101N101N2N2N1O3M2N3L4K7I6G>YOm]P1"}}, {"score": 0.12132811546325684, "bbox": [443.7, 287.7, 131.8, 137.5], "category": "cat", "mask": {"size": [486, 640], "counts": "Tal67l>6K3L4L4M3N2N2O1O1N2O1O1O1O1O1O100O2N1O1O1O100O101N2O0O2N2N101N3M2O2M1O2N1O2N1O1O1O2O1N1O1O1O100O1O1O1O1O1O010O001O1O010O1O00001O01O01O0001O00010O00001O0000001O00001N101O0O2O0O2N2M3N1O2N2N2N3M2M4M2M4K5K5K7I9B`[Q1"}}, {"score": 0.11833766102790833, "bbox": [329.4, 247.6, 145.6, 107.0], "category": "cat", "mask": {"size": [486, 640], "counts": "heZ5>f>3M4K4M4M3M3M2M2O2N2N1O1O2N1O101N10001N100O101N100O2O00001O001O000O100000000O1000000000001O0000000O0100000000001O0000000000000001O000O10001O00001O001O00001O0O2O00001N2O1O1O1O1N2O1O1N2O1N3M2N3M3M3L3L5L4LTed2"}}, {"score": 0.08850200474262238, "bbox": [116.3, 239.7, 83.6, 65.4], "category": "bird", "mask": {"size": [486, 640], "counts": "f\\o19k>4K3N3M2O1O1O1O1O2N1O1O1O1O100O100O100O10O0100O100O100O10O10O1000O01000O010O10O0100O100O1O100O1N3N1N3N2K5L6GmXf6"}}, {"score": 0.0742282047867775, "bbox": [0.0, 3.2, 101.4, 162.8], "category": "toilet", "mask": {"size": [486, 640], "counts": "a0U3Q<000O2N3N1O1N2O01N1O1O10001O010O10O0100O100O2O4K5L7H=C4L3M3N1N011O02O00O0O2O2N2M3M2O1O2M3OO002M01M2N3L4M2010O01O010N101M2L5K5L7J]i`8"}}, {"score": 0.06431513279676437, "bbox": [391.7, 252.0, 90.1, 85.8], "category": "cat", "mask": {"size": [486, 640], "counts": "Ucn5f0Z>:I7I5L3M4M1O2N2N2O0O2O0O101N1000001O000O10000000000000000000000001O000000000000001O000O101O000O2O000O2O001N101N101N2O2M2N2N3M4L3L4M5I9Fi\\`2"}}, {"score": 0.05640191212296486, "bbox": [0.0, 10.8, 210.6, 235.0], 
"category": "toilet", "mask": {"size": [486, 640], "counts": "n0Q5U:000000000O2N2N101N1O2N101N1O2O00001O001O010O00001O0010000O010O100O010O100O001O100O001O100O001O100O00100O1O010O1O10O01O100O0010O01O10O00010O010O1O01O01O010O000001O01O0001O00010O0001O00001O0001O01O000006J0O10000000000O1000000000000000000000000O100000000O10000000000O10000O100O1O1O100O1000J600O10000O10000O2O3M6I4M3M3L4M3L2O1N2O1N3M3M5L3L4L4M3L2N2N3L3N3L4M3L4L5J5K6I6L6K4K6K4K6I6J7I7I8H;YO[ac6"}}, {"score": 0.045486290007829666, "bbox": [0.0, 247.0, 180.3, 239.0], "category": "person", "mask": {"size": [486, 640], "counts": "m9Y5m9000000000001O001O001O1O0000O1O1O100O1O1O1O1001O00001O00001O001O1O1O1O001O001O001O001O1O001O1O001O1O1O001O1O1O1O001O1O3M1O1O1O1O1O1O002N2N3M2N2N1O2N1O1O1O1O1O1O1O001O1O001O00001O001O001O0000000000000000O100001O000000001O000000001O0000001O0000000000001O00001O3M2N000000001O0000002N2N2N2N1O2N1O2N1O6C>B>ZNiBj0Q>DifV7"}}, {"score": 0.04514412209391594, "bbox": [301.4, 233.5, 127.0, 101.9], "category": "cat", "mask": {"size": [486, 640], "counts": "T`T57n>3L3M2N3L3O1N3N1N2O2N1O2N1N2O2M2O1O1O1O1O100O2O00001N100O1O1O2N1O101N1000O0100O10000000000O1001N1000001O001O0O2O001N2O1N3M4L3M2M4L5K7J7HZ`Z3"}}, {"score": 0.04503950476646423, "bbox": [206.6, 29.0, 44.2, 102.1], "category": "person", "mask": {"size": [486, 640], "counts": "SSV3n0T>=D8I7I6J4M3M3M2N2O0O2O0000000001N2N1O2N2M4L4K5I8G<_OUkl5"}}, {"score": 0.0446353480219841, "bbox": [466.7, 276.8, 93.2, 84.5], "category": "cat", "mask": {"size": [486, 640], "counts": "fVR7b0`>:H7I4M4L3M2N3N1N2O1N2O0O2N101O001O001O0O101O0000001O00000000000000000000000001O000000000000000O1000000O2O000O2O0O2O0O101N1O2O1N1O2N2N2N2M4M3L4K8F^]Z1"}}, {"score": 0.04462585598230362, "bbox": [208.3, 23.0, 95.3, 120.4], "category": "person", "mask": {"size": [486, 640], "counts": "ScV34a>g0D?G6J5M3L3N2N1O2N1O2N100O100O2O000O10O100000000O1000000O2N100O2N2N1N3N2N3L3M5I8GbeS5"}}, {"score": 0.039184171706438065, "bbox": [181.5, 23.4, 36.2, 78.9], "category": "person", 
"mask": {"size": [486, 640], "counts": "TWj2250b>e0E9H5L4L4M2M101O0001O1O1N3K5J7Ho^]6"}}]}, {"image_id": 20671, "dets": [{"score": 0.9689025282859802, "bbox": [1.6, 8.1, 630.2, 458.4], "category": "car", "mask": {"size": [480, 640], "counts": "X?Z>f000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000E;O10000O10000O100O10000O100O1O100O1O100O100O1O100O1O100O1O100O1O100O1O100O1O100O1O1O1O1N2O1N2O1O1N2O1N2O1O1O1N2O1O1O1M3M3M3L4M3K5M3O1O1O1O1O1O1O100O1O1O100O1O1O100O1O1N2O1O1O1N2I7I7J6G9H8J6M3O1O1O1O1O1N2O1N2N2N2M3M3L4L4L4M3L4M3N2N2O1O1O1O1N2O1O1O1O1O1N2O1O1N2O1O1N2O1O1O1O1O1O1O100O100O1O100O1O1O100O1O1O1O1O1O1O1O1N2O1O1O1O100O1O1O1O100O100O10000O10000O10000O100O100O100O100O100O1O100O1O1O100O1O1O10000000000000000000000001O000000000000001O0000000000000000001O0000000000O1000000000000000000000000O10000000000000000O100000000000000001O001O00001O001O00001O001O001O001O001O00001O001O001O001O0000000000000000O100000000O1000000O1000000O100O100O100O100O1O1O1001O1O001O1O001O1O001O001O001O001O001O00001O00001O00001O0000001O00001O0c0@9L3K4M3K5L4L3M4L3N2N3M2O1O2N1N2O1O100O1O100O1O100O010O100O1000O010000O1000O10000O01000000O100000O10000O10000O10000O101N100O100O2N1O1O1O1O2O0O2N1O2N1O2M2N3N2M3M3M3L4K6H8I7GSWV5"}}, {"score": 0.4934457838535309, "bbox": [249.7, 90.4, 290.4, 362.4], "category": "person", "mask": {"size": [480, 640], "counts": 
"ZPa44i>8H8H6J6K4M2J5Eg0@8H:G=C;E;E4K5L5WDZMn:d3G7J3L4M2M3N3M3L:G7I2N2M4M3M3L5L4L3L5L3M3L4M3M4L4L3M2N1N201N101N101N2N2N3N1N2N2N2N1O2N1O1O1O1O100O1O100O1O1O2N100O010O10000O10O10O100O10O01000O10O01O01O01O00000010O01N2O2N1O2N1O2N1N3N1O1N3N1O1N3N1N2O1N2N1O2N6J3N3M3L3L5K4M4N2O0O2O0O2O0O2O1N101O0O2O1N1O101N1O2O0O2N1I8O1O001O1O1O1O1O1O1O001O001O001O001O001O001O00001O00001O00001O001O001O001O001O00001O001O001O001O1O001O1O1O1O1O1O001O1O001O1O001O1O1O001O2N1O1O1O00001O1O07JO100O101N100O101N10O10O010000001O1N3N1OO10000O10000O1O1O2O0O2N2O11N3N0OYN\\D3b;fN^Dd04h0\\;\\NkDb017GHo;J`D:E:^;XOPE;D`0[;oNVE?Ad0[7J5L3M4M1N3N2O0O2O1N100O101O0O100000001O0000000000000000000000000001O00000000001O00001N10001O1N101O1N2O1N3N2L5L3M4K<@hP\\1"}}, {"score": 0.13791412115097046, "bbox": [450.4, 5.5, 189.6, 134.0], "category": "car", "mask": {"size": [480, 640], "counts": "hSW7;b>4O2M2O1O2M2O1O1N101O1O1N2O1O2M3N1N2O1O001O1O001O1O1O1N2O1O1O001O1O1O1O1N200O001O100O100O1O100O10O010O0010O0100O00100O00100O2O1N3N2M2N3M8H1O1N2O1O1O1N5YOmA:]>O2O02M2O0O01O0]OkA<]>O1O1O101N2O0O10O1O2OO1N1LUlf0"}}, {"score": 0.13668467104434967, "bbox": [1.7, 31.2, 188.6, 416.9], "category": "car", "mask": {"size": [480, 640], "counts": "n`0RAh0YOT1kN=DBa0]Oi0VO=F7J5K4L3N2M3N2M200O2O0O10001N0100000001O00001N101O1N2O2M2O2M3M3L4J8G;@[Wj5"}}, {"score": 0.13296261429786682, "bbox": [368.8, 0.7, 180.5, 77.4], "category": "car", "mask": {"size": [480, 640], "counts": "`YV62l>4M101N1O3M3N1N101N10001O0000001O001O1O010O0O2O00001N10001O00001O00001O0000000O2O000000000000001O00000000000001O01O1O01O001O001O1O2N1O0000001O0KmA]OS>c0nA\\OS>b0nA^OS>1kA:3EX>9iAGX>89N4LUVP2"}}, {"score": 0.13278521597385406, "bbox": [335.2, 246.3, 43.9, 75.9], "category": "bottle", "mask": {"size": [480, 640], "counts": "Z`Q5h0T>?B;F5L2N3M101N2O0O1000000001O00001O1O1O1N2O2M4L4K6Ggjm3"}}, {"score": 0.12200597673654556, "bbox": [436.2, 0.3, 164.3, 86.2], "category": "car", "mask": {"size": [480, 640], "counts": 
"_RP72m>4L3N2M3M2N3M2O0O2O1N2O001N101O001N2O00001O00001O001O1O00000001O0001O000000001O000000000000000001O00000O101O00001O0O2O0O5L2M2M3N2M_o\\1"}}, {"score": 0.11196828633546829, "bbox": [354.4, 10.4, 219.3, 115.2], "category": "car", "mask": {"size": [480, 640], "counts": "mQZ67i>2M3N2M2O1O1N2O0O2N1O2N100O1O2O001O001O001O1O1O001N1010O01O001O1O3M3M001O1O0000000000000000000000000001O000010O0001O01O010O010O01O1O00010O001O0O2J6M3O1O1N2N1N4K5Mj_k1"}}, {"score": 0.11037629097700119, "bbox": [582.4, 6.1, 57.6, 78.6], "category": "potted plant", "mask": {"size": [480, 640], "counts": "Z]g85b>>G7K5K5L3M1O2N2N2N2O0O2O0O101O0000O10O1001N10001N1O1O2N1O3M2M3M4L5I;CSg3"}}, {"score": 0.09456644207239151, "bbox": [223.0, 341.8, 81.7, 60.3], "category": "handbag", "mask": {"size": [480, 640], "counts": "fd\\34d>?F6K4L3N2N2N2N1O2N101N101N10000O2O000O1000000O100000001O000000000000000000O100000001O0000001O00001N10001N101O1N101N2O1N2N2N3M3L6I8Ebno4"}}, {"score": 0.09297014027833939, "bbox": [279.7, 2.2, 210.5, 66.6], "category": "car", "mask": {"size": [480, 640], "counts": "Zom41o>1O5K1N2O1O0\\AH`>=O001O0O10000O100O2O0O1000000000001O00000000001O0000000000001O000000000000000000000000000000000000000000000000000001O00000O100000000000001O001O1N2O001O2N1N100N3N101N2Jd[S3"}}]}, {"image_id": 26654, "dets": [{"score": 0.9760717749595642, "bbox": [335.3, 135.6, 90.3, 110.7], "category": "dog", "mask": {"size": [340, 640], "counts": "kfb36V:f0_O7J5K7J4K4M3M3N2M3N1N3N1N2O1O1O0O2O1O001O001N2O001O000000000O10000000001O00000001O00000000000000000O2O000O2O0O1O2N1O2M2O2M2M4L5L3M4J5L6E=^O[WZ2"}}, {"score": 0.8716892004013062, "bbox": [0.0, 18.4, 625.2, 321.6], "category": "car", "mask": {"size": [340, 640], "counts": 
"Z1h7l200000000000000000000000000000O1N2O2N101N1O2O0O2N2O1O0O2O1N3N1O2N1N3N1O1O1N101O1O001N101O00001O1N2O2N3M2N3L4M3M8@8000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008H001O000000001O0000001O00001O00001O00001O00001O00001O00001O00001O00001O00001O00001O00001O0000001O00001O0000001O00001O001O00001O001O001O001O1O001O00000000000000000000000000000000im4"}}, {"score": 0.6130072474479675, "bbox": [71.0, 134.3, 192.2, 201.2], "category": "suitcase", "mask": {"size": [340, 640], "counts": "Rim0d0j9d0ZOB9H8H8H8I7I6K5K4L4L4L4L3M4M2M4L4L3N3L3N2M4M2N2N1O2N2N1N3N2N1O2N2N1O2N2O0O2N101N1O2O0O2O0O1O2O0O2O0O2N101N101N2O0O2O1N100O10000O100O10001O001N101O00001N101O00001O001N101O000010O01O1O001O001O000010O01O001O0010O01O00100O0010O00100O00100O1O1O100N102N1O1O1O1N3N1O2M3M3M3L5L3M4L5J6J6@a0E;FB6J5L5J5L4L5J5L3M4L4L2N3B>N1O1O1O2N1O1OB3M3L4M4L3M2O2M3M4K4L4K5K5J6J7I7J6J6J8H7H8F;B>E;F9G9E;C=B?@a0_Ob0_Oc0^Oa0_Oe0YOZ2\\M_Zh3"}}, {"score": 0.199271559715271, "bbox": [99.4, 302.0, 39.1, 77.9], "category": "clock", "mask": {"size": [640, 388], "counts": 
"ZZQ2h0Sc0:H6J6K4M2M4M1O2O1N101O0O100000000001N10001N1O2N3M2M4M3K6I:Dmim4"}}, {"score": 0.15595859289169312, "bbox": [22.3, 24.1, 189.2, 179.4], "category": "umbrella", "mask": {"size": [640, 388], "counts": "^_c0i0Sc0:G7J6H7J6J4M4M3L3N3L3N2M3M3N1O2M3N2N1O2N2N2M3N1O2M2O2N1N3N1O2M2O2N1N3N1O2M2O2N1O2N1O2N1O1O2N1O1O101N1O100O101N100O101N10000O2O0K50000000001O00000000000000001O00000000000000000000000000O10000O10000O10000000000000000O10000O1000000O1000000O100O100O105J1O1O2O0O1O2N101N1O2O0O2N2N1O2N2N2N2O1N2N2N2O1N2M3N2N3M2N3M2N3L4M3L5K4L5L3K6K6H7G:F?_O\\Qf3"}}, {"score": 0.08412851393222809, "bbox": [98.2, 398.5, 12.5, 44.5], "category": "person", "mask": {"size": [640, 388], "counts": "nhP2;cc04ONTk_5"}}, {"score": 0.07826267182826996, "bbox": [21.9, 43.6, 196.4, 376.0], "category": "potted plant", "mask": {"size": [640, 388], "counts": "_X=^2ba00000000M:D`0A8H6K6J5K5L4M2M4L3M4I6L4L4M3M4L3N2M3N1N3N1N3N1O2M2O1O2M2O2N1O2N2N2N2N2N2N2N1O2N1O1O1O1O1O1O2N1O1O2O1N2E;O1O1O1O2N1O001O001O001O001O001O1O1O1O2N1O2N1O2N1O2N1O1O1O001O1O001O1O1O2N1O1O1O1O001O001O001O1O1O1O1O1O1O2N1O1O001O001O001O00001O000:F1O001O1O1O001N2O1O1O010O1O1O1O1O1O1N2N2N2M4M2M4M3M3M2N3N2N3L3N1O2M2O1O1N3N1N3L3N3M3L4M3M3M3L4M2M4K5L4L4L4L5J5L5J6I7G:G9H9G8G:D>Ab0ZOl0lN`V]3"}}, {"score": 0.07038532197475433, "bbox": [93.4, 399.9, 11.0, 41.1], "category": "person", "mask": {"size": [640, 388], "counts": "P`b7"}}, {"score": 0.06693143397569656, "bbox": [105.9, 20.6, 109.6, 121.5], "category": "umbrella", "mask": {"size": [640, 388], "counts": "ob]25fc0;E8J3M4M3N1O2M2M4K4N3M2N3L3M3M3M4M2N2M4M2N2N2O2N1O1O1O1O101N1000000O1000O10000O100O100O100O100O1O1O1O1O2O000O2N1O2M2N3M3N1O2N2N3M2N2N3L3M5J9Cj0WOQZg3"}}, {"score": 0.06361984461545944, "bbox": [1.1, 24.4, 101.7, 419.6], "category": "potted plant", "mask": {"size": [640, 388], "counts": 
"Tg0V6j=000I9G9J6L4L4O0O01000O1N1N3M2O1N21N2N3M3L3N1N100O101ON3N2N2O2N1O01M3N2L4M3K5K5J6H8N2N100O1O0N4M2O2O00001O3M2N3M2N1O02N2N1O2O03M2O1N2N2N001O1O002N4J6J5K6I6L3N2N3M2N2M5K4K5L4L7H;A=CAm^W5"}}, {"score": 0.048700425773859024, "bbox": [93.4, 424.7, 13.9, 31.5], "category": "person", "mask": {"size": [640, 388], "counts": "P`b7"}}, {"score": 0.04640410095453262, "bbox": [90.2, 402.0, 8.3, 23.6], "category": "person", "mask": {"size": [640, 388], "counts": "P`b7"}}, {"score": 0.04627426341176033, "bbox": [61.2, 102.8, 72.5, 72.5], "category": "donut", "mask": {"size": [640, 388], "counts": "RdZ1`0[c0:I5K4L3N3M2N2N2N2N2O0O2N2O000O2O000O101O00000O10001O0000000000000001N100000001O000O2O001N101N101N2N2O1N2N3L3N3L5J7HC5F;F9I8I4M1000000000YM"}}, {"score": 0.04177384823560715, "bbox": [74.6, 0.0, 139.5, 89.2], "category": "umbrella", "mask": {"size": [640, 388], "counts": "oeg16hc04H7O2M1O2N2M2O100O2M2O1N2O2M200O1N2O1N2M3M3O1N2O1O0O2O1O1O1O1O1O100O1O1O10O01O1O100O010O100000O0100000000000O010000000O2O00000O10000O10000O2O000O10001N10000O2O00001N101O00001N100O101O0O101N101N2O001N1O2O1N1O2O1N2N101N2N2M3M3N2@f\\O4bbc3"}}]}, {"image_id": 37017, "dets": [{"score": 0.709408700466156, "bbox": [83.7, 140.3, 290.0, 325.3], "category": "dog", "mask": {"size": [480, 640], "counts": "nlZ1j0k=f0\\Oa0A=F:F8H7J6J6J6J5K6K5K4L5L3L4L4L4M3L4L4M3L4M3M3M3M3M2N3N1N3M2N3N1N3M3M2O2M2N3M3N1N3N1N3N2M2O2M3N2M3N2M3N2M4M2N2M3N2N2N2M2O2N1O2M2O2N1O2M2O2N1O1O1O1N2O1O1O1O1O001N2O1O001O1O0O2O009G00000000000O100000000000000000000000000000000000000000000000000001O000000000000000000001O000000000000001O0000001O000000001O0000001O0000001O00001O00001O001O00001O001O001O001O001O1O001O1O001O1O001O1O1O1O1O001O1O1O1O1O1O2N1O1O1O2N1O1O2N1O1O2N1O1O2N1O1O1O1O2F9N2N2N2O2M2N2N2O2M2N2N2N3M2N3M3M2M4L4L5K4L3M4L4L4L3M4L4K5L5J5K6J5J8I7H9G8H9F9H8F:D`0@`0_Of0TO\\YU4"}}, {"score": 0.141817107796669, "bbox": [0.0, 291.9, 143.4, 157.5], "category": "sink", "mask": {"size": [480, 640], "counts": 
"^:V3j;00000N3M3M3N1O2N1O2N1O1O2O0O100O100O1000000O10000O10000O01000O10000O01000O100O01000O10O10O1000O01000O10O010000O10O010000O1O10O0100O100O1O2N1O1O1O2N100N2O1O2N1O1O1O101N1O101N1O1O2N2M3N2M2O2M3N2K5I7L5J5L4L5K4L6J7G;AfQi7"}}, {"score": 0.1382063925266266, "bbox": [171.8, 0.0, 83.3, 58.9], "category": "book", "mask": {"size": [480, 640], "counts": "f\\d27`>`0H6I4N1N2O1N2O1N101O00001N1000001O00000000000000000000000000000000000000000001O000000000O10001O0000000O2O00000O2O001N2O1N2N2N3L6G`eh5"}}, {"score": 0.12520915269851685, "bbox": [407.8, 46.3, 87.6, 150.3], "category": "chair", "mask": {"size": [480, 640], "counts": "WPT6d0W>f0@7K3M2O1N2O001O0000000000000001N101O2M2N4K7H_nZ6"}}, {"score": 0.0904126986861229, "bbox": [405.3, 14.6, 174.0, 249.9], "category": "chair", "mask": {"size": [480, 640], "counts": "RPc69d>7J6J3L4M3M4M2N3M2O2M5L2M3M3N3L3M3N2M3N2N4K7J4L4K>C8H3L3N2N1O2N1O2VEXLX:[4L3N2N1O2N4K3N3M2N2N1N2O2N001N101N101N101O00000000000001O000O010O1O1O1O0O2O1O1001O001O001O000000O100O001O1O1O1O1N2N2N2M3M3N2N2O1O2K5I7J7K4gN_EnMf:h1iEoMZ:m1oEkMT:Q2a1K4I8_Oc0B`0E\\`T1"}}, {"score": 0.08836183696985245, "bbox": [0.4, 0.0, 84.9, 76.5], "category": "chair", "mask": {"size": [480, 640], "counts": "cW4l0P>:H4M4L3N1N2N101O0O101O000O10000000000O1000001O0000000000000000000000000000000000O10001O000000001O00000000001N10001N1O2O1N2N102L3M7G_[X8"}}, {"score": 0.08784817159175873, "bbox": [378.9, 60.8, 88.8, 152.4], "category": "chair", "mask": {"size": [480, 640], "counts": "oYh5d0U>a0C?B:F8I5K5K6J5K4L3N2M3N2N1N2O1O2N1O1N2O1N101O001O001O00001O0000001O001OO2O0O1000001O001O0O2N2N1O2N1O2O0O2N2M4M2M4L4K5K6H9I6Dj0^N]We2"}}, {"score": 0.08471328765153885, "bbox": [412.1, 2.4, 27.9, 50.3], "category": "bottle", "mask": {"size": [480, 640], "counts": "g_S67a>c0B7J3N2N1O2O0000000001O1N2N3M4K9BaRP3"}}, {"score": 0.08122306317090988, "bbox": [0.0, 50.5, 191.7, 366.8], "category": "oven", "mask": {"size": [480, 640], "counts": 
"T2[:e4000000001O000000001O00001O0001O001N2O00001O00000000O1000000O01000000O100O100O100O1O10O10O10O100O10000O10O1O100O1O1O1O1O100O010O1O010O001O1O001O10O10O10O100O1G810000O1O100O100O1O100O1O1O1O1O1O1O1O1N2O1N2N2N2N2N2M3M3M3M3L4M3M3L4L4J6G9J6iNmHZJZ7d5QIlIV7S6l0N2N2M3L4L4M3M3N2M3N2M3N2M3N2M3MEa0@ocm6"}}, {"score": 0.07910406589508057, "bbox": [343.3, 55.1, 78.2, 144.7], "category": "chair", "mask": {"size": [480, 640], "counts": "_PV5`0[>`0Ca0@9G5K4L6J6J4M4K3N3M3M2M3N2N3M3M2O0O1O1O1O1O1O010O0100O001O10O01O001N101O1O1O2N1N2O1N3N2M4L3M4L4L4L3L6J9E=C>BRW[3"}}, {"score": 0.07622530311346054, "bbox": [473.2, 73.1, 94.1, 141.4], "category": "chair", "mask": {"size": [480, 640], "counts": "iiU7P1f=c0D;E7I7J7I4M4K4N1N2N2N3M2N2O0O2O001N1O2O0O2O000O1000001N1000001O0O1000000000000000O1000001O0O1000000O1O2N1O2N1O2N1O2N1N3M3M3K6K4M4H9B?YOQ1mNP]U1"}}, {"score": 0.07091023027896881, "bbox": [438.1, 50.1, 102.4, 154.0], "category": "chair", "mask": {"size": [480, 640], "counts": "b]d6`0Z>`0C;E9I8H6J5K6K4K5K4M2M4M3M3L3N1O2M3N3M2O0O1O1O1O1O1O1O001O00001O000001O00000010O00001O01O0000001N101O0O2O00001N101O0O2N2N2M3N2N2N2N2N2N2M3M4J6K6H8Dl0UOjmc1"}}, {"score": 0.0703699141740799, "bbox": [207.1, 15.8, 67.6, 55.9], "category": "book", "mask": {"size": [480, 640], "counts": "SfV34f>:K4L3N1N2N2O1O2N001N101O001O001O00001O0000010O000000000001O0000001O1O001N2O1N2O1N2N4L3KlV`5"}}, {"score": 0.07031132280826569, "bbox": [88.2, 58.3, 527.5, 421.7], "category": "dining table", "mask": {"size": [480, 640], "counts": 
"ZZd2`0^>:G4L3M3L3N3L3N2M3N2M3N2M2L5K5L3M4M2N=D1O1O1N2O1N2N2N2O1N2O1O1O1O1O1N2O1O1O1O1O100O1O1O1O1O1O1N2O1N2N2N2N2N2O1O100O1O100O100O1O100O1O100O1O100O100O1O100O100O1O10000000000O10000000000O1000000000000O100000000000000O10000O100O100O100O1O1O1O1N2O1N2N2M3M3M3O1N2O1O1O1O100000000000000O1000000O10000O1O1O1O1M3L400O100O100O1000000O10000000000O10000000000O1000000O1000000O10000O100O1O100O1N2O1O1N2M3O1O1O1O1O1O100O10000O1000000O10000000000O10000000000O10000000000O100000000O1000000O1000000000000000000000000000000000000000000000000000000O100O100O10000O10000O10000O100000000O1O1N2iN\\HQKe7n4`HmJa7Q5cHlJ^7S5eHjJ\\7V5eHhJ\\7W5fHgJ[7Y5gHbJ\\7^5eH_J]7b5dHYJ_7g5eHRJ^7n5j000001O0000001O000000001O00001O0000001O00001O1O1O001O1O001O1O001O1O001O001O001O001O001O00001O001O00001O001O001O001O003M2N2N2N2N2N1O2N2N1O2N1O001O001O1O1O1O1O1O2N2N3M4L4L3M2N2N2N2N2N2N1O1O2N2N5K7I8H1O1O1O2N1O1O2N1O2N2N1O2N2N2N2N2N3M2N2N3M2N3M4L6J6J6J2N1O1O000000000000000000000000a[;"}}, {"score": 0.06369833648204803, "bbox": [367.0, 135.9, 212.9, 325.7], "category": "dining table", "mask": {"size": [480, 640], "counts": "bPY63g>:F9ZOf0J5K6I6K6J5L4L4M4K4M3M2N2N2N3M2M2N3K5L4M3M3N2M3N2N:F1O1O1O1O1N2O1N2N2M3M3N2M3N2M3N2M3N2N2N2O1N2M3N2F:G9N2N2N2N2O1N2WO_IQIc6c6TJPIn5n6S1M3M3M3M3M3N2N2N2O1N2N2O1O1N2O1O1O1O100O1O100O100O100O10000O10000O10000O100O1000000000000001O00000000000000000000001O0G9N2O2N2N2M3N2N2M3N2M3N2kKXIMl6DgI3d6TOSJc0X6]NjJU1d5RN]KS1k9UOg0UORnP1"}}, {"score": 0.062445998191833496, "bbox": [5.6, 270.0, 123.8, 109.8], "category": "sink", "mask": {"size": [480, 640], "counts": "lo4184X>g0A6H8K6K4K3N2M3N2N101O001N101O000O1O2N100O100O101N1000000000O010000000O1000000O0100000000000000000O100O10001N100O100O2O0O10001O00001O0O2N100O2O1N101N2N2M3N2N2O2M4M1N3M2M3J?BThm7"}}, {"score": 0.06008974090218544, "bbox": [333.7, 6.9, 188.6, 205.4], "category": "chair", "mask": {"size": [480, 640], "counts": "\\j_5:c>6L4K6J7I6J5L5K4L;FI5Kah\\2"}}, {"score": 0.05939947068691254, "bbox": [13.7, 3.1, 53.0, 53.8], "category": 
"chair", "mask": {"size": [480, 640], "counts": "cX;6K5K4N1O1N100O101OO1001O00000000O101O01O00000001N101O1M3N3M5H_ga8"}}, {"score": 0.058148931711912155, "bbox": [118.9, 0.0, 67.7, 59.0], "category": "bottle", "mask": {"size": [480, 640], "counts": "`_l17e>9J4L4L4M2M3N1N3N1O001O1N101O001O0000001O0000001O0000000001O000000001O0O2O001N101N2N2N2N4K5Ha_i6"}}, {"score": 0.05707545578479767, "bbox": [4.1, 323.0, 110.5, 75.7], "category": "sink", "mask": {"size": [480, 640], "counts": "Sm6`0Z>;G7K4L3N2O1N2N2O1O0O101N1000001N10000000001N1000000000000000000O10000000000000000000000O1000000000000000000000O1000000000000000000O100000000O10000O10000O2O000O2N100O1O2N1O2N2M3M4L5K7EZ`i7"}}, {"score": 0.0555306114256382, "bbox": [413.2, 96.4, 95.5, 141.2], "category": "chair", "mask": {"size": [480, 640], "counts": "ZfY6f0U>f0[O9I:F7I5L5K4L3M2N2N2N2N2N2O1O1N2O1O0O100O2O00001O1O0O101O0O1000000O101O000000000000O101O0000001N100O2O0O100O2O0N3N1O2N2N2M3M4L3M4K6H8F6J5M3L3N2M2O2N1O1O1O1O1O1N2O010O00001O100O000001O01O01O001O1O001O1O2N2M2M4M5K2N>^ObWX5"}}, {"score": 0.05231763422489166, "bbox": [483.9, 114.4, 79.1, 159.9], "category": "chair", "mask": {"size": [480, 640], "counts": "eRZ72f>m0SOg0\\O:F9G7J8I6L3L4L5K4M2N101N3M2O1N2N2N101N100O10001N101O0O10000000O1000000000001N100O2N1O2N101N1O2N2M4L3L5J7J6I9XOS1\\NQZV1"}}, {"score": 0.052141305059194565, "bbox": [311.3, 103.1, 156.4, 151.7], "category": "chair", "mask": {"size": [480, 640], "counts": "^mP54h>6E;N2N5K4K3N2N1O4L6J2N1O2N1O1N2O1O1O2N1O2N2N3M4L5K100O001O1O1O1O101N1O100O00100O001O1O1O1O001O1O001O010O0010O0001O0000000O1000000001O1O1O001O1O00000O10O100000O101O001N101N2O0O2N2N2N1O2N1O1O2N2N3M3L4L5K5J4Kd0\\O8Gb[i2"}}]}, {"image_id": 47263, "dets": [{"score": 0.9006879925727844, "bbox": [378.2, 92.7, 108.1, 99.0], "category": "car", "mask": {"size": [480, 640], "counts": 
"aec5c0A9K5J5J6K4M4M2M2N2O2M2O1O2N1O1O1O100O1O101N10000O1000000O10000000000O1000000000000000000O11O00000000000000001O0000000000001O0001O0001O0000001O0O2O1O001O001O1O1O1O1O1O1O2N1O2N1N3N1N4M2M3M3M3L5K6H:F;D5000[QX2"}}, {"score": 0.7907956838607788, "bbox": [341.1, 96.9, 33.0, 84.3], "category": "person", "mask": {"size": [480, 640], "counts": "jhR5a0Z>`0Ba0_O9H4L3M2O2N1O100000O2O1N2O2M3L5I6C>C?A`mn3"}}, {"score": 0.1330031007528305, "bbox": [523.8, 4.9, 92.5, 56.4], "category": "potted plant", "mask": {"size": [480, 640], "counts": "gZj7a0]>5K4M2M4N1N1O2O1N101O0O2O001O0O1000001O0O1000000000001O0000000O100000000000000000000000000000O101O000000000O1000001O0O10001N1O101N101N1O2N1O2N3L4K6J\\`>"}}, {"score": 0.12774568796157837, "bbox": [225.7, 188.7, 30.9, 42.9], "category": "sheep", "mask": {"size": [480, 640], "counts": "\\P\\37e>9I5K4M2O1N101N100000000000O101N1O2N2M4L4Jlee5"}}, {"score": 0.1256403923034668, "bbox": [231.3, 68.1, 27.5, 58.9], "category": "person", "mask": {"size": [480, 640], "counts": "dd_3d0Y>9I5J4M3M2O000001O1N3N2L5H<_O^Ze5"}}, {"score": 0.08343073725700378, "bbox": [96.2, 58.7, 27.6, 69.4], "category": "person", "mask": {"size": [480, 640], "counts": "`l_1e0V>`0C6J5K3N2N10000002M2N3M3L6E?YOicd7"}}, {"score": 0.08200515061616898, "bbox": [542.8, 13.9, 48.5, 43.3], "category": "potted plant", "mask": {"size": [480, 640], "counts": "nhR8=`>5M3L3O1N2N1O2O0O101O0O1000000000000000O10001O0O1O2O1N2M3N2M7FWWj0"}}, {"score": 0.07759971916675568, "bbox": [465.7, 115.0, 28.9, 49.4], "category": "car", "mask": {"size": [480, 640], "counts": "Rml68d>7K3L3N2O1N100O2O00001N10001O0000O1001O00000001N100000001N100O2O0O2N2N2N3L6IVm^8"}}, {"score": 0.07152868807315826, "bbox": [220.3, 65.8, 25.2, 55.1], "category": "person", "mask": {"size": [480, 640], "counts": "fnZ36e>;H7I4N1N2O01O002M3L7F_]k5"}}, {"score": 0.06544452905654907, "bbox": [469.8, 107.3, 19.3, 25.3], "category": "car", "mask": {"size": [480, 640], "counts": "edP71Z[[2"}}, {"score": 0.06512779742479324, 
"bbox": [430.6, 299.9, 178.5, 80.1], "category": "car", "mask": {"size": [480, 640], "counts": "dbm61n>2N1O2N101N1\\AJ]>=0001N1O2N1O2N2O00000O1000000O100O1O2O0000001N100000001N100O100000000O100000000000000000O11N101O00000O2O000000001O00000000001O0001O0001O000000001O0001O0000001O000000010O00001O00001O001O00001O0000001N10001O00001N2O1O1O1N2O1N2N3N1N2M3N2H9LY[c0"}}, {"score": 0.06395906209945679, "bbox": [440.8, 101.2, 37.5, 38.0], "category": "car", "mask": {"size": [480, 640], "counts": "_Qc69f>2N2N2O1N2O1O001O1O00010O01O001O1O1O1O2M4K\\d^2"}}, {"score": 0.06363372504711151, "bbox": [199.1, 55.6, 24.0, 25.9], "category": "car", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.0634830892086029, "bbox": [471.6, 116.7, 21.8, 25.9], "category": "car", "mask": {"size": [480, 640], "counts": "obQ72m>2Q_Y2"}}, {"score": 0.06331244856119156, "bbox": [569.3, 8.9, 56.3, 53.1], "category": "potted plant", "mask": {"size": [480, 640], "counts": "i[`8b0\\>5L3M2M3N2N2O001N101N10001O0O1000000000000001O00000O10000O2O0O2N2N2M3N3K7G[j9"}}, {"score": 0.062185220420360565, "bbox": [493.5, 262.4, 128.4, 111.2], "category": "car", "mask": {"size": [480, 640], "counts": "Tcc73h>8^AFn=1QBg0i=;M1N3N1O2N2N2N2N1O101N1O1O1O1O100O2O0O10000O100O1O10001O0O100000001N10000O101O00000O101O000000001O000000001O0O10000010O0000000010O01O00001O001O0000001O000O2O001O001O0O2N2N3M2N2N2M7I5K5Kc0YOPn:"}}, {"score": 0.061349593102931976, "bbox": [410.1, 283.1, 213.4, 64.8], "category": "car", "mask": {"size": [480, 640], "counts": "RTe61h>2\\ANb>5\\AMc>8O100O2O001N100O2O0O1O2O000000000000000000000O10O10O1000O1000000O1001N10000O2O00000000000O2O00000000000000001OO1000000000O1000000000000000000O101O000000001O00000010O0000000000O100000000000000O11O000001O000000000001O000000010O0000001O00000000000000000000001O00001O1O1O00001O00001O0O101N100O2O1N1O2O2M3M2N4LPh="}}, {"score": 0.06112436205148697, "bbox": [243.7, 72.7, 17.0, 28.8], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, 
{"score": 0.06021162495017052, "bbox": [388.3, 100.4, 81.4, 47.1], "category": "car", "mask": {"size": [480, 640], "counts": "jgi5:c>5L3M3N2N1O2N100O1O2O0O10000O100000000O1000000000000000000000000000000000000000000000000001O0000001O00001O0O2O001O0O2O1O1N2O1N3M4K7GVkb2"}}, {"score": 0.059221938252449036, "bbox": [185.9, 59.9, 27.1, 22.4], "category": "car", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.05771750956773758, "bbox": [237.4, 64.0, 14.2, 30.2], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.05750373378396034, "bbox": [431.5, 250.9, 183.6, 79.3], "category": "car", "mask": {"size": [480, 640], "counts": "Sam62m>101N1O101ZAK^>6`AL`>;N100O101N1O1O2O0O101N100000001O0O101O001O00000000000000000O100O1O1O100O10000O010000000O100000O1001N101O0000001N100000001O0000000000001O0000000000001O000000000001O000010O00000001O00000000001O001O00001O00001O00001O0000001O00001N102N2M2N2O1N3M4L2M4KlQa0"}}, {"score": 0.05685345083475113, "bbox": [232.1, 68.8, 20.3, 34.2], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.0556815080344677, "bbox": [247.2, 70.2, 11.7, 22.1], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.05541396886110306, "bbox": [450.3, 99.3, 21.7, 23.5], "category": "car", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.05432945862412453, "bbox": [354.1, 221.4, 251.6, 199.4], "category": "car", "mask": {"size": [480, 640], "counts": "lko63l>3M3M2fAGj=b0N2AROjBP1U=oNkBR1a=1O`0_O4M2N1O1O2N3L4M4L5K2M2N2N1O2N2N2N1O2N1O2O0O2O1O2N3M3M3M2O0O2N1O2N1O00100O001O1O010O1O001O1O001O010O1O00010O001O00010O000010O0001O010O000010O01O01O00100O1O1O1O1O1O2N1O1O1O1N3N1N3N2M2N2N3N1N2N2O1N2N5K5J8G9Eh0VOa0^OVfe0"}}, {"score": 0.05402795597910881, "bbox": [95.0, 54.8, 17.7, 54.0], "category": "person", "mask": {"size": [480, 640], "counts": "U]_1=a>9I2N01N4Jlhi7"}}, {"score": 0.053893834352493286, "bbox": [25.6, 2.4, 81.6, 63.9], "category": "potted 
plant", "mask": {"size": [480, 640], "counts": "i[a05g>:H5L3L3N2O1N1O2O1N101O001N101O001O00001O001O0000000000000000000001O000000000O10001O000O1O2O0O1O2O0O2N2M3N3L7H7EZmo7"}}, {"score": 0.053839169442653656, "bbox": [96.3, 40.0, 22.5, 57.4], "category": "person", "mask": {"size": [480, 640], "counts": "kXa1?`>2N22LUkh7"}}, {"score": 0.053322333842515945, "bbox": [475.9, 102.1, 14.5, 22.5], "category": "car", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.052613772451877594, "bbox": [458.9, 97.9, 17.4, 19.9], "category": "car", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.05250180512666702, "bbox": [243.1, 72.2, 11.1, 17.3], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.05250130221247673, "bbox": [532.3, 8.6, 41.3, 51.0], "category": "potted plant", "mask": {"size": [480, 640], "counts": "lPo7=_>7L3L3N3N1N1O101N100000O1O1O2N1O2M5J6JZcS1"}}, {"score": 0.052319396287202835, "bbox": [406.7, 97.1, 51.1, 26.1], "category": "car", "mask": {"size": [480, 640], "counts": "\\UR67h>2N1O1O100O101O0O1000000O100000000000000000O2O000000001O00001O1N101N3N2Ke_h2"}}, {"score": 0.052106257528066635, "bbox": [91.2, 46.6, 16.3, 41.5], "category": "person", "mask": {"size": [480, 640], "counts": "P_^14l`m7"}}, {"score": 0.05203590169548988, "bbox": [322.7, 125.9, 16.3, 29.8], "category": "person", "mask": {"size": [480, 640], "counts": "YYk44k>0ih_4"}}, {"score": 0.05155792087316513, "bbox": [433.2, 96.6, 34.1, 24.0], "category": "car", "mask": {"size": [480, 640], "counts": "Yh_63k>201N10001O000000O101O010O1NhVe2"}}, {"score": 0.0504264235496521, "bbox": [437.4, 107.9, 46.4, 66.8], "category": "car", "mask": {"size": [480, 640], "counts": "of`6a0[>9H9H5L3M2N3N1N101O0000001O0O1001O0000010O0O101O001N2N3M2N3M4K6GSj[2"}}, {"score": 0.05026014521718025, "bbox": [514.0, 54.2, 104.2, 66.1], "category": "chair", "mask": {"size": [480, 640], "counts": 
"jdf75d>=G5L4L3M3N2N2N1O2N1O2O0O2N100O100O2O000O10000O10000O10001O000O100000000000000O10000000000000000000000000000001O000000000000001N100000001O0O101O001N1O2O0O2O1M3M4M3L4K6Ig`="}}, {"score": 0.04964280501008034, "bbox": [231.4, 69.6, 11.5, 17.6], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.04947434365749359, "bbox": [353.4, 103.8, 19.6, 35.9], "category": "person", "mask": {"size": [480, 640], "counts": "d\\X56g>7J3N2O01N2N4J`ko3"}}, {"score": 0.04946169629693031, "bbox": [385.9, 103.6, 14.6, 15.3], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.0494413897395134, "bbox": [98.9, 53.1, 16.0, 31.9], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.049385376274585724, "bbox": [359.5, 112.2, 27.6, 61.7], "category": "person", "mask": {"size": [480, 640], "counts": "ThZ5>]>?D7I4M2M3O0O100000O2O1M3N2L4J8GWYi3"}}, {"score": 0.049094926565885544, "bbox": [369.5, 102.6, 12.2, 16.2], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.04861941188573837, "bbox": [403.6, 101.3, 18.4, 15.0], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.04839828610420227, "bbox": [231.2, 302.4, 180.1, 77.8], "category": "car", "mask": {"size": [480, 640], "counts": "mnm33l>3H7N2N2O0O101N100O2O000O10000O1O1O1O1O101OO1000000000001O00O10000O10000O10000O100O10000O10000O10O10000000000O100O02O001O000O2O00000000000000000000000000000000000000000001O000010O000001O0O10001N101N101O001N2O000O2N2O1N101^OfA;`>01H_ANmQi3"}}, {"score": 0.048265907913446426, "bbox": [433.3, 102.3, 18.7, 15.8], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.04800510033965111, "bbox": [473.8, 117.8, 28.1, 36.1], "category": "car", "mask": {"size": [480, 640], "counts": "QRR76h>7J101N2OO1O2N2L4LSXU2"}}, {"score": 0.04776378348469734, "bbox": [420.3, 102.3, 17.3, 13.3], "category": "person", "mask": {"size": [480, 
640], "counts": "PP\\9"}}, {"score": 0.04766545072197914, "bbox": [388.4, 96.5, 11.9, 13.0], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}]}, {"image_id": 57703, "dets": [{"score": 0.9998639822006226, "bbox": [491.3, 50.7, 128.9, 314.9], "category": "person", "mask": {"size": [488, 640], "counts": "gTd7e0\\>d0]OEB;F9G9G9J7I6I8H7J6K4L5K5K5K5K6J6J7H7I8G=B?@nT>"}}, {"score": 0.9855355024337769, "bbox": [281.2, 72.3, 72.9, 143.9], "category": "person", "mask": {"size": [488, 640], "counts": "dkY41m>h0]O8J8H8I9G7I7I7I4L4L3M3N1N3M2N2M4K4M3M3N100010O0010O100O10001O100O1O010O1000O010O2N2N3M3M2M3N3L2M4L4L3L5J6I7I7I8H9F;E?]OeRZ4"}}, {"score": 0.9414498209953308, "bbox": [6.8, 48.7, 118.8, 337.1], "category": "person", "mask": {"size": [488, 640], "counts": "dZ6X1i=?B7J5M3L4M2O1N3N2M2O1N101N2O1O001N101O001O001O0O101O00000O2O0000000O100000000O100000001O000000000O100000000O010000000000O10000O1O100O1O100O1O1O2M2O1N2O2M2O2L4L3O2L4M4K5J7HbVX2"}}, {"score": 0.6553152799606323, "bbox": [263.9, 187.0, 112.6, 120.0], "category": "sheep", "mask": {"size": [488, 640], "counts": "S[R4g0\\>=G7I6J5L3M4L3N3L3N1O2M3N1O2N1O2N2N3M1O2N2N1O1O2N100O2N1O1O10O01O10O010O010O0100O100O00100O100O100O100O10000O100O10000O10001O01O010O0O2N3N1N4L7I7H6K6I7FQcl2"}}, {"score": 0.17217901349067688, "bbox": [2.3, 102.3, 88.6, 166.3], "category": "handbag", "mask": {"size": [488, 640], "counts": "kc::n>2M7J1O10O2N1O1O2M2N2HTeP9"}}, {"score": 0.11622042208909988, "bbox": [541.2, 81.8, 84.2, 129.9], "category": "backpack", "mask": {"size": [488, 640], "counts": "bl\\85o>6J7J7K4K2N3M2N3O1N2O2N2N2N1O00102M2O0O2O000000O1O2O2N000000000100ON1O2N101O1O001O2M2O1N2N2O2M3N1O1O6BQZ;"}}, {"score": 0.11029749363660812, "bbox": [230.4, 144.6, 82.4, 68.5], "category": "bench", "mask": {"size": [488, 640], "counts": "f\\e3;k>3N3M2N2N2N2O001O1O1O1O001O001O00001O00001O0000001O000001O000001O0001O0000001O0O101N101M3N2M4M4J7HehS5"}}, {"score": 0.10764151811599731, "bbox": [2.8, 169.3, 45.5, 106.5], 
"category": "tennis racket", "mask": {"size": [488, 640], "counts": "jk6a0b>>C:I5K4L3N1O2N1O10O10O02N1O1N3N2N2N4K7H8FZdo8"}}, {"score": 0.09188403189182281, "bbox": [492.1, 100.1, 85.6, 135.6], "category": "handbag", "mask": {"size": [488, 640], "counts": "Q\\i75R?4M2M5L8G6ION01GkAEV>;mABT>?kA@W>?lA]O^>;;M_n`1"}}, {"score": 0.08721879869699478, "bbox": [318.3, 259.2, 73.5, 50.7], "category": "sheep", "mask": {"size": [488, 640], "counts": "Xdk48l>9I4L3N2N2N2O0O2N101N1000001N10000000000000000O100000000000000000000001O0O100000000O100O2O000O101N1O2O1N1O2N2M4L6H_gi3"}}, {"score": 0.08416694402694702, "bbox": [13.8, 49.4, 86.7, 84.5], "category": "person", "mask": {"size": [488, 640], "counts": "[]=>f>:G6L4K4M2N3M2N2N2N2O1O0O2O000O101O0O1000000000000O20O0O100000000010O0001O0O2O001O001N2O1N2O1O1N2O1N3M3M5G:H>_OZlX8"}}, {"score": 0.0836942195892334, "bbox": [19.1, 54.5, 47.7, 64.0], "category": "person", "mask": {"size": [488, 640], "counts": "Rk>9k>f0]O9I8I2N2N101O0O11O1O1N2M3L6K5J9]OjaT9"}}, {"score": 0.07663829624652863, "bbox": [291.7, 83.7, 40.3, 54.9], "category": "person", "mask": {"size": [488, 640], "counts": "n\\b4`0d>4N2O1O2N0100O2N2M2N5G\\lh4"}}, {"score": 0.07296919077634811, "bbox": [49.4, 60.1, 47.1, 62.6], "category": "person", "mask": {"size": [488, 640], "counts": "dTm03Q?;H3M4M3L3N1O1O1O001O01O1O1O1O2N2M4K6GYZZ8"}}, {"score": 0.0725122019648552, "bbox": [69.4, 66.4, 39.9, 67.6], "category": "person", "mask": {"size": [488, 640], "counts": "VUW19m>=C5L3N1O1O0101N2M2O4K7HYdS8"}}, {"score": 0.07096704840660095, "bbox": [340.4, 81.7, 82.6, 131.9], "category": "person", "mask": {"size": [488, 640], "counts": "UXZ5?c>;H5K6J7I5L;E6I6J6L3M3N2M2N1O1O10O010001O001O01O001O1O1O1O2O0O2O1O1O0O2N1O2N1O2M3N2N2M3L3M4L5L4H;A=@P^^3"}}, {"score": 0.07025453448295593, "bbox": [242.1, 159.0, 64.8, 30.1], "category": "bench", "mask": {"size": [488, 640], "counts": "lmn32U?101O000001O0000000O2O00kk[5"}}, {"score": 0.07001303136348724, "bbox": [309.4, 84.5, 35.9, 51.0], 
"category": "person", "mask": {"size": [488, 640], "counts": "cai48n>5K5L2O0010O2N2M7CTA0eTc4"}}, {"score": 0.06990930438041687, "bbox": [256.4, 219.6, 46.7, 85.7], "category": "elephant", "mask": {"size": [488, 640], "counts": "l`n3?d>`0B9H8H5L6J2N2O1O1N2O00000O2O0001O0O1000000O2N1N3M3L4K6J7H9E>]O\\WS5"}}, {"score": 0.06774276494979858, "bbox": [542.9, 3.7, 83.2, 77.8], "category": "potted plant", "mask": {"size": [488, 640], "counts": "h`X81R?`0B7J4L4M2N3M2O1N2O1N2O1O001N2O1O00001O00001O0O101O000001O00000000000000001O00000O2O000O2O0O101N2N1O2M2O3M4K4K7GVZ="}}, {"score": 0.06712530553340912, "bbox": [252.4, 130.8, 89.3, 84.4], "category": "bench", "mask": {"size": [488, 640], "counts": "fVS4;k>6K4L2N3M3N2N1N2O1O1N2O0O2O001O0O2O1O00000000000000001O01O00001O00001O0000001N101O0N4L3M3M5J7I9Cm]f4"}}, {"score": 0.0670989379286766, "bbox": [562.1, 68.4, 43.6, 53.3], "category": "person", "mask": {"size": [488, 640], "counts": "d_d84T?2M3N1O3NO100J_fh0"}}, {"score": 0.06704379618167877, "bbox": [0.0, 67.1, 24.7, 77.2], "category": "person", "mask": {"size": [488, 640], "counts": "`n2c0]>^OniX9"}}, {"score": 0.06512000411748886, "bbox": [463.2, 186.2, 61.1, 51.3], "category": "baseball bat", "mask": {"size": [488, 640], "counts": "PPa9"}}]}, {"image_id": 57936, "dets": [{"score": 0.9825682640075684, "bbox": [286.2, 145.1, 249.7, 213.7], "category": "dog", "mask": {"size": [360, 640], "counts": "WYY3=f:;H5K5K4L5L2O2M3M3N2N2M2O2M3N1O1N3N2N3M3L3N2N2N2N3L3N2N2N2N2N2M?B4L4L>_IULi4Q4kJYLQ5S5L1O2N2N1N2O2N1O6KO00000000000000000000001O00001O0000001O0000001O00001O001O00001O001O1O1O1O2N1O2N2N1O2N2N2N2N2N2N2N2N3M2N2N2N2N1O2N1O2N1O2N2N1O2N2N2N1O1O1O2N1O1O1O1O1O1O1O1O001O1O1O1O1O001O1O1O1O1O1O1O1O1O001O1O1O1O001O1O001O1O001O1O001O1O001O1O001O1O1O001O1O001O001O1O1O001O1O1O001O1O001O1O001O1O001O1O1O1O1O001O1O1O1O001O1O1O001O1O1O2N1O2N1O2N2N1O2N2N2N1O4F;I6J6I6JjkY1"}}, {"score": 0.9728432893753052, "bbox": [471.9, 9.8, 167.9, 296.2], "category": "person", "mask": {"size": [360, 640], 
"counts": "llY5h1W9i0POj0B=B`0@`0@>B`0A>C:E9I8G8I8H7J6J3M3L3N2M4M2M3N2M3G9O1O1O008H00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008H1O001O1O1O001O1O001G9N2N2N2N2M4M3L5K5J6J7I6I8H8G6L300000^9"}}, {"score": 0.9044991135597229, "bbox": [174.5, 77.5, 184.9, 277.8], "category": "person", "mask": {"size": [360, 640], "counts": "llP29a0OY9d1[O=Bm0QHlLU6f3]IfLZ6[4H7J5J7J5J6J:F3M3M3N2M3O1N2N2O1O1N2N2N2N2N2N2O1N2N2O1O1O1O1O1N2O1O1O1O1O1M3N2N2N2O1O1N2O100O100O100O100O10000O100O100O10000O10000000000O1000000000000001O0000000000001O00001O00001O00001O1O0I8N2N101N3M2O2M3M4L4L<[KTLW1g4ZM^LT2b6I6J7YOgGRNc8a1T1ZOXmf3"}}, {"score": 0.3982588052749634, "bbox": [13.4, 0.0, 160.2, 360.0], "category": "refrigerator", "mask": {"size": [360, 640], "counts": "Xb4]W[1;_SeN1N2N2N2O1N2N2M3M3H8Bg0SOe0ZOf0\\Nd1XOh0TOl0@a0H9F:E;_Ob0_Od0]Od0^OnfX5"}}, {"score": 0.26649144291877747, "bbox": [494.9, 263.3, 145.1, 96.7], "category": "dining table", "mask": {"size": [360, 640], "counts": "nRf5;l:3L7J1L4O1O1O1O1O100N2O1O100O100O100O1O100O1O100O100O100O100O1O10000O10000O100O10000O100O1O1O1O1N2O1O100O1O1N2O1N2O1O1O100O1O1O100O1O1O1O100O1O100O100O100O1O10000O1000000O1000000O10000000000000000000000000000000000O100000000001O00001O0000000000"}}, {"score": 0.2568615972995758, "bbox": [190.7, 42.6, 320.7, 308.6], "category": "person", "mask": {"size": [360, 640], "counts": 
"mV]2b0]:T1PO?Ba0^O8I7I7I6K5I7iH_L]6[4H8K4K5L4K5K5L4L4M3M3M:F2M3L4J6L4N2M3N2M3M3L4L4N2N2N2O1N2N2O1N2N2O1N2O1O1O1O1O100O100O100O100O10000O1000000000000000000O1000000000000001O00001O00001O001O001O001O1O001O001O1O001O1O1O1O1O1O1O1O1O1^IjLP5Z3hJkLW5e3UJ`Lj5j3gI]LW6\\4N2N3M4L6J6J4L3M2N2N2N2N1O2N2N4L5K4L2N2N1O2N1O2N1O1O1O2N1O1O2N1O1O2N1O1O2N1O2N2N1O2N1O1O2N1O001O1O1O1O1O1O001O1O1O1O001O001O1O1O001O1O1O001O001O1O001O001O001O001O001O001O00001O0000001O001O001O1O001O1O1O001O1O001O1O1O2F9O26IO1O1O1O1O1O1O1O1O1O3M3M4L4L3M2N2N2N2B?H7I^Xn1"}}, {"score": 0.1806502342224121, "bbox": [146.6, 33.0, 96.0, 144.7], "category": "refrigerator", "mask": {"size": [360, 640], "counts": "]af1k0Y:9I6J5L4L4L3N1N5L4L1N3N2N1N3N3M2N2N2N2M4M2N2N2N6J4L4L2N2N1N3N1O1O1O1O001O000L5O000000000000000000O10000O100O100O1O1O1O1O1O1N2O1O1N2O5J2O1O1O2M2O1N3M2M4L4L3L5I7J7I7F;Ga0WO?N10bk[4"}}, {"score": 0.14118263125419617, "bbox": [205.5, 118.6, 94.1, 125.9], "category": "backpack", "mask": {"size": [360, 640], "counts": "ST\\25P;=C6J5L4L3M2N2M3N2O1O2O0O1O100O001OO1O2O10O01O1O1M2O2O1O0O1N3J5M3O2N3I6M3M3N10O1FHcE:]:90O000001O0O2O00010O01N2L301N2N3N101O1O3Llhn3"}}, {"score": 0.13345837593078613, "bbox": [103.6, 29.4, 145.1, 239.5], "category": "refrigerator", "mask": {"size": [360, 640], "counts": "feU1>[:f0Eb0_O8I6J7J6E9M2N2N1O2N2N2N5K3M3M3M3M3M4L4L3M2jJlKT3W4TKoK?JO4\\4e4]K]K24^4c4XK`K61_4U5]KQK_4Q5^KRK`4m5N1O1O001O1O001O1O001O001O001O0000000000001O001O00001O001O00O100O10000O1000000001O000000O1O1O1O1M3N2O1N2N2N2M3N2N2O1N2L4I7L4M3M3_O]J[Ke5d4dJRK^5m4?N2O1M3L4N2O1N2M3N2M3M3M3N2M3N2N2M3N2L4L4M3M3N2O1N2N2O8G4J6K5K5K5K7G:Ed0XO]m\\4"}}, {"score": 0.11278590559959412, "bbox": [421.6, 4.5, 13.4, 44.7], "category": "book", "mask": {"size": [360, 640], "counts": "PPQ7"}}, {"score": 0.10306649655103683, "bbox": [499.9, 2.1, 129.8, 67.0], "category": "backpack", "mask": {"size": [360, 640], "counts": 
"jec5>Y:d0I5L4N2N2O1N1O2O0O2O0O101M20001O0000001O000000001O00000000000000000000001O00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000O100000000000000O100000000O3N001O0O2O1N2N2N2N4K7H5N0000Tk3"}}, {"score": 0.09908927232027054, "bbox": [175.4, 109.6, 90.6, 136.9], "category": "backpack", "mask": {"size": [360, 640], "counts": "SWQ2h0]:7J4K5L4K5L2O2M2N3N1O1O1N2N20001O0O2O1OO10O010O01O01O0010O1N2M3N1L4N2]ObF[Oa9b0gFXOY9d0g0N11MKlE^OP:e0SFXOl9i081N1O2O02O2N100O1G9000O10O1N2O0O2O3L6IWQZ4"}}, {"score": 0.09613148123025894, "bbox": [166.0, 8.4, 95.7, 119.2], "category": "refrigerator", "mask": {"size": [360, 640], "counts": "VVm15Q;;E6J4N1N3M2N4M1O2N1N2O1O001O00001O1N2O2N1L4O1O03NO1O1O1O1O1O1O1L4O1O2N5TGmMY8c2N2N3M2N1O001O001O001O001O1O00000000000000O100000003M001O0O1O100O1O2N101N101N1M4K5K6I9jN^1[OVoV4"}}, {"score": 0.09463709592819214, "bbox": [431.7, 2.7, 13.4, 46.1], "category": "book", "mask": {"size": [360, 640], "counts": "mdi4;j:6OHbYV2"}}, {"score": 0.09302827715873718, "bbox": [392.6, 0.0, 72.0, 50.8], "category": "book", "mask": {"size": [360, 640], "counts": "lo\\4?f:6J5L3N101N2O0O2O0O100O101O000O1000000000000000000000000000000000000000O10000000001O000O101O000O2O0O2N2N3L6FeeP2"}}, {"score": 0.08969395607709885, "bbox": [309.6, 137.5, 101.4, 137.0], "category": "dog", "mask": {"size": [360, 640], "counts": "We`3h0\\:>fE\\OZ9]1K5K6J5L4L2N2N1O2N2O0O2N2O1N2O1O0000O1000O2O0O100000O100O1O2N1001O001OO2L3O101O0O2N2J5M4N2O0O2L5J5L5B?fNYFj0\\:E;FXdl2"}}, {"score": 0.08959902077913284, "bbox": [419.4, 2.4, 16.4, 27.1], "category": "book", "mask": {"size": [360, 640], "counts": "PPQ7"}}, {"score": 0.08776964247226715, "bbox": [313.6, 7.4, 81.0, 125.7], "category": "refrigerator", "mask": {"size": [360, 640], "counts": "[Qd39k:7K5L3M3M2O5J4L2O2M3N2M4M3M1O1O3L3N1O1O1N2O3UGQNW8R2bGTN\\8_2L3NO100N2N200000000O100O1O1O100O101N10001N100N3M2N3L4K5L4J6G:J6E=F[bh2"}}, {"score": 0.08619269728660583, "bbox": [181.8, 0.0, 127.0, 
102.8], "category": "refrigerator", "mask": {"size": [360, 640], "counts": "fkS25P;5M3L5L3N001N10001N2O1O1O001O001N2O00001O0000001O000O2O00001O001O001O001O1O1O1O001L33NO001O1O2N2N5K1L4O1O1O005iFPNl8W2O1O1O1O1O0000000000001O00000000001O00001O000000000000O1000000O1000000O10000O4L1O1O1O1N3N1M4M3M2K6K5F:J7I6EV_f3"}}, {"score": 0.08570577204227448, "bbox": [0.0, 0.0, 104.7, 327.8], "category": "refrigerator", "mask": {"size": [360, 640], "counts": "0W:Q1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000F:M3N2M3M3L4K=B7E:eK[4D=B?@a0F:E:_Ob0Af0ZOSln5"}}, {"score": 0.0851387232542038, "bbox": [111.8, 26.4, 102.4, 91.1], "category": "refrigerator", "mask": {"size": [360, 640], "counts": "XlY15l:Ca0^O:G8I7J6I7J5M3L4M2M4M2M3N3L3N3M2N2N3M2O1N101N2O1N101N101O0O101N10000O2O0O10001O0O1000001N100001O01O0000001O0000000001O000000000001O01O00010O00001O00001O0010O01O001O1N101O1O1O1O1O1N2O2N1N3N1O1O2M2O2N2M3N3L3N2M4L3N2M3M3M3M4K5L3L5K4L5K5J6I8H7H;CA?A`0]OVSZ3"}}, {"score": 0.811697244644165, "bbox": [178.8, 1.9, 187.3, 212.1], "category": "toilet", "mask": {"size": [480, 640], "counts": "]Xn2Q1h==H7ZCkNU;h2F8I6J6K4L3M3M4L3M4L4L4L4L3M3N2M3M3N2M2O2M3N1N3N1O2M2O001N101O0O2O1O000O2O0O2J50001O00001O00001O0000006J00000000000000000000000000000000000000000000000000000000000000K5O10000O10000O1000000O100O10000O100O1O100O100O1O1O1O1O1N2N2N2N2N2N2N2O1N2N2M3N2N2M3O1N2N2O1N2O1N2N2O1N2N2N2N2O1N2N2N8H3M2K6K5K5L4L5J5J7H8G9G:G8H?^OVhR4"}}, {"score": 0.279572069644928, "bbox": [139.8, 20.9, 177.4, 334.7], "category": "toilet", "mask": {"size": [480, 640], "counts": "geY2_1jN]OTio4"}}, {"score": 0.10515543073415756, "bbox": [203.6, 150.2, 104.9, 64.0], "category": "sink", "mask": {"size": [480, 640], "counts": 
"`RS3=`>8H5L4L4M2N2O1N2O0O2O0O2O001N10001N10000O2O0000000O101O000000000000000O10000000001O00000000000000000000O1000001O00000000000000000O101O00000O101O000O2O0O101O0O2O0O2N101N2N3L3M4M3L7EeXn4"}}, {"score": 0.102972112596035, "bbox": [55.3, 0.0, 359.2, 436.3], "category": "toilet", "mask": {"size": [480, 640], "counts": "\\ji0W1i=000000000000000dNUO`DP1^;^OnCm0m;X1K4L4L4L4L4L4L3M2N3M3M2N3M3M3M3M3M3M3M3M2N3M2N2N2N2N1O2N2N2N1O2N1O2N2N3M5K5K5K4L4L4L6ZIUIk4Q7iJXIT5S7YJXId5U8M3gJ\\GP4g8fKPHj3R8nKXHn3k7jK_HQ4]9K4LB6J4L5K4L2N1O1O2N1O1O1O001O1O1O001O1O1O1O1O001O1O1O001O1O1O;E000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000E;O1O1O1O1O1O1O1O1O1O1O1O1O1O1O1O1O1O1N2O1M3M3N2M3M3M3M3M3L4M3L4L4L4J6H8I7K5N2M3M3M3M3M3M3L4L4L4L4M3L4N2M3N2N2N2M3N2N2M3M3M3L4M3M3L4M3M3M3M3M3M3L4M3L4L4M3M3M3M3M3M3M3N2M3N2M3N2M3N2M3M3L4M3L4L4M3M3M3N2M3M`0@6I6H8I8F9D=_Ob0nNnde3"}}, {"score": 0.10020279884338379, "bbox": [249.4, 0.6, 120.9, 155.7], "category": "toilet", "mask": {"size": [480, 640], "counts": "jSn35h>?B6J5J6K5K5L;kB[NZ5L4L4mAYOf=R1N2N101N3M3N1N101N2O001N101N101M3O001O001O001O001O001O001O00001O0000001O00001O0000000000000000000000001O00000000000000000000000000000000O10000000000000000O100000000O1000000O100O10000O100O100O100O10000O100O10000O10000O100O100O100O1O1O1O1O1O3M2M2N3M3L4L4L4L4L4M200000f[Q4"}}, {"score": 0.09241195768117905, "bbox": [229.8, 344.1, 184.6, 135.9], "category": "dog", "mask": {"size": [480, 640], "counts": 
"Wog33k>6I5WOI`B=[=h0H7F;M1N2O2N1M3M3N3N1O1O2N1O1O1O1O1O1O1O1O100O10001N100O100O1O100O1O10O0100O1000000000000001O000000O01000000000O1000000000000000000000O1000000O10000O1O1O1O2O001O1O001O001O0O1FiCkMWi0XO=B=TCUOo:j2C=D5J6K5K7J7H6K5K4K6K4K6K4L4M3L3M3M2O2M2O1N2O1H8O001O001O001O001O000000000000O1000000O10000O10000O10000O100O100O1N2O1N2N9H1N2N3L3M3L4K5K6J5L5L4M2M4M3L3M4L4K5H8J7K5J6K5H8E:J7J5_Oc0J6I7H8eNfB<[>FWhl2"}}, {"score": 0.07686787843704224, "bbox": [149.2, 133.0, 146.3, 181.6], "category": "toilet", "mask": {"size": [480, 640], "counts": "ne[2>P>j0[Oe0B;F9I6J6L3M3M4L3M3M3M3N3L4M3L2O1O2M2O00100O1O00100O001O0000000O2O0000000O2O000001O2N2N2N1O1O1O1O1O1O2N1O2M2O2N2N1O2N2N2N1O1O1N2O1O2M2O2M3N2M3M3M3M2N2N2N1O2N2M5K4L4K:G9G5K6I8C\\Sf5"}}, {"score": 0.07431148737668991, "bbox": [301.9, 0.0, 110.5, 179.1], "category": "toilet", "mask": {"size": [480, 640], "counts": "n_f44g>`0E6J4L5J5K5L4L5K8Ia0^O7J3Lb0_O3L4M3L9H4K2O3H7N1O2N1O001O1O1O1O001O1O001O001O00000000000000000000O10000O1000000O10000O1N2O1O1O1O1N2L4M3N2N2M3N2N2L4M3N2O1N2N2N6I4K5YO`DcMj;T2f0I7M4J5J:F:C8J4M2N3N0O2O0O2O000O2O000000000000000000000000O1000001N100O2N1O2N2N2M3M[Ta4"}}, {"score": 0.06796098500490189, "bbox": [231.2, 89.3, 141.9, 127.0], "category": "toilet", "mask": {"size": [480, 640], "counts": "R_c39d>;E;D7I6K5K5J5L4K5M3N3L3M3M3M3M3N1O1N3N1O2N1O1O2N1O1O101N1O1O1O1O1O2N100O100O10000O10000O100000000O100000O10O1000000O1M300000O10000000000O10000O10000O1000000O100O1O1O1O100O1O1N2N2N2O1M3N2M3N2N2N2N2N2O4K3N1N2M4L3N3M2M4M3L4M3N2L4M2N3M3M3M6H9Gcko3"}}, {"score": 0.06765744090080261, "bbox": [224.2, 16.4, 208.3, 267.8], "category": "toilet", "mask": {"size": [480, 640], "counts": 
"Vdg34f>d0gB@o:n0QDSOL;i;_2G8H7I6K6I7K5K4L5K4L4L4K6K3N2M4M2M4M3L3M4M3L4M2M4M3M4L3M3N2M3M2O2M2O1N1I8O00001O001O1O001O1O1O1O1O1O1O1O1O001O0000000000000000000000O1000000O1000000O1000000000000O100000000O1O1O100O1O1O100O1O100O1O1O1O1N2O1N2M3N2M3N2N2N2N2N2N2N2N2N2O1N2O1N2N2O1N2N2N2O1N2N2N2N2N2N2N2N2M3N2N2N2M3N9F4L4L4L4L3M4J6K5K6J5L5J5L5J6I8I6K5K5K8F;Cf0ROTRW3"}}, {"score": 0.0638396143913269, "bbox": [172.2, 62.4, 302.2, 306.5], "category": "toilet", "mask": {"size": [480, 640], "counts": "]hW39b>=oNIcB`0S=l0I7L4M3M3L5L;E8G4M2M4M2M4M2M3M4M2L4M4L4K5M4K5L4K4M3M3M2N3M2N3M2M3L5L3N3L4M3M4M3L4L3N1N3N1N2O1G9O1O1O1O1O1O1O3M2N2N2N2N1O2N1O2N1O1O2N2N2N2N2N3M1O2N2N1O1O2N001O001O001O1O001O00001O000000001O000000000000001O00000000000000000000O1000000000000O1000000001O00000000001O000000000000000000000000000000000000000000O10000O10000O1O1O1O1O1N2O1O1O1N2N2O1N2O1N2N2M3M;E4_Oa0L3N3M3N1N3N2M2N3K5I7I6I8I8G8I7G:POP1B>G9H9H7J6I9H9F:E>A>@YZR3"}}, {"score": 0.06297429651021957, "bbox": [257.5, 14.8, 110.6, 35.5], "category": "bowl", "mask": {"size": [480, 640], "counts": "mfl37g>4L3O1N2N2O0O2O000O2O000O2O000000001N1000000000000O10001O000000000000000000000000000000000000000000000000000000000000000000000000000000000001N1000000000001O000O10001O000O101O0O101N2O0O2N2N2N5ITjQ4"}}]}, {"image_id": 68078, "dets": [{"score": 0.9540343284606934, "bbox": [164.7, 342.9, 189.0, 287.6], "category": "toilet", "mask": {"size": [640, 360], "counts": "Sbc3Q1mb08I5J:G1O1N2O1O1O1N2O1O1O1O1O1O1O1O1O1O100O1N2O1N2O1O1O1O1N2O1O1O1O1O1N200O1O1O1O1O100O1O1O1O1N2O1O1O1O1O1O1O1O1O1O1O1O1N2O1N2O1N2O1N2O1N2O1M3N2L4K5L4L4L4M3M3J6J6WOi0_Oa0@`0H8L4L4M3M3N2M3N2M3O1O1O1O1O100O100O100O100O100O10000O100O100O100O10000O1I7000000000000000000000000000G9O1O1O1O1O1O100O1O1O1O1O100O1O1O1O1O1O1O100O1O8H1O2N1N2O1O10000000RQ4"}}, {"score": 0.8259328603744507, "bbox": [142.4, 298.2, 168.0, 106.7], "category": "sink", "mask": {"size": [640, 360], "counts": 
"`Ri2?ac000000N4J6K5L5J6L4L2N2N2N2N2N2O1N2N2O1N2O001N2O001N2O001N101O001O1N101O001N101O001O001O1N101O00001O0000001O0000000O2O00001O00000010O00000001O000001O000001O0001O000000000000001O0000001O000000001O00001O001O0000001O0000001O00001O00001N101O00001O00001N101O001N10001N101O001N10001O0O2O000O2O000O2O0O2O0O2N1O2O1N1O2N2N2N3M3L5J6J6K5H6F900000oln0"}}, {"score": 0.4481736123561859, "bbox": [275.7, 280.0, 27.0, 67.9], "category": "bottle", "mask": {"size": [640, 360], "counts": "`Y_5P1jb09I6K4M3M2O2N100O1001O1O1N2N4L4L6I:AdjU1"}}, {"score": 0.15579140186309814, "bbox": [253.5, 344.0, 105.7, 210.9], "category": "toilet", "mask": {"size": [640, 360], "counts": "Wge5:Sc0f0\\Od0G9C?SOgMc_Od2Q`0k0_Oa0H8J5K5K4N3M3N2M5L3M2M3N2N3N1N3M3N1N2O1N2O1O1N100000000O2O5K0000000000000000000001O0000000000001I6O1O1O1O2N1N2O2M3N1N3N1000j8"}}, {"score": 0.14412224292755127, "bbox": [23.8, 487.4, 83.3, 141.1], "category": "dog", "mask": {"size": [640, 360], "counts": "loa0?\\c0\\5L2O2N5J01O1O1O1O1O1O1O2N1O1O2N1O1O1O1O2N1O2N2N3M2N2N3M3M3M2N1O2N2N2N1O3M2N3M3M2N3M2N2N2N2N2N2N2N2N2N1O2N1O1O2N1O2N1O2N2N1O2N1O1O1O1O2N2N3M2N2N2N2N2N1O2N2N2N2N2N2N2N1O2N3M3M4L5K4L5E9J6IhTU4"}}, {"score": 0.07651013880968094, "bbox": [217.5, 324.3, 89.5, 71.1], "category": "sink", "mask": {"size": [640, 360], "counts": "Ug[41ic0?B9I6L3L3N2O2M1O2O1N2O0O2O001N101O0O101O000O10001O00000000001O0O10000000000000O100000000000001O000O10000000001N1000001N1O101O0O1O2O0O2N1O2O1N2N2M3M3M5J6JVaT1"}}, {"score": 0.07585379481315613, "bbox": [213.2, 358.8, 89.9, 115.2], "category": "sink", "mask": {"size": [640, 360], "counts": "^m^41kc06F;JB=Z_OoNj=c2d@bMV?j2Y@_Md?b3L4M2O1O0O3N1N3M3N1O001N2O3M3M1N2O1O001O1O4L2N2OO00\\C"}}, {"score": 0.05282856523990631, "bbox": [138.6, 339.4, 50.8, 100.5], "category": "chair", "mask": {"size": [640, 360], "counts": "\\[j2e0Vc0b0@:H5J6K4L3N1N3N1O1O0O2O00001O00100O0000O1000001N101O001N1O2O1N2N3L4M3K6J6HeX^3"}}, {"score": 0.051506832242012024, "bbox": [0.0, 60.5, 113.9, 446.7], "category": "person", 
"mask": {"size": [640, 360], "counts": "l1Z=f60000000000000000O1000000O1001O000000000000O1000000O1O100O100O1000000000000O10000O100O1O1O1O100O100O10C>B>BR1jN`0ZOf0cN\\cV5"}}, {"score": 0.051058728247880936, "bbox": [259.3, 297.5, 15.8, 38.8], "category": "bottle", "mask": {"size": [640, 360], "counts": "kaV5:cc05OO1M5FZ\\O2kmg1"}}, {"score": 0.05100162327289581, "bbox": [286.6, 329.2, 72.7, 156.9], "category": "toilet", "mask": {"size": [640, 360], "counts": "kSh5R1eb0a0D9G8J7I5K4L5K5K4M4K5L3M2O2M3M3M3N2M2N2O2M2O1O1N2O001O0O2O001O001O0O2O001O001O4L0000000000000001O000000001O00000L400O2N1O2M2O10\\9"}}, {"score": 0.05049477517604828, "bbox": [0.0, 13.3, 211.8, 541.6], "category": "person", "mask": {"size": [640, 360], "counts": "=m`0S3000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000B>000000O100O100O100O100O1O100O1000000000000O1000000O1O1O100O1O1O1O1O1O1O1O100O1O1O1O1O1O1O1O1O1N2O1O1N2N2O1N2N2N2N2O1O1N2O1N2O?@4K5L4K5J4ZKgI`K_6k3ZJlKk5k2cKiLd4f2RLiLY4c2bLfJUMf0c6U4R7E:J7oMSAjNQ?Q1`A^Nf>^1Q2M4J5J6Gb^T4"}}, {"score": 0.05020910128951073, "bbox": [0.0, 332.7, 214.0, 196.5], "category": "chair", "mask": {"size": [640, 360], "counts": "eP6F6L3N2O01O1N3M5J9Fgnn0"}}]}, {"image_id": 70815, "dets": [{"score": 0.8298240303993225, "bbox": [169.5, 265.8, 80.9, 106.1], "category": "dog", "mask": {"size": [640, 427], "counts": "oQb3P1lb07G7J6L3N3M2M4N1N2O1O1O1O2N001O100O001O1O010O1O0010O0010O0100O0100O01O1O001O1O1O1N2O1O1N2O1N2N2N2N2O2M2O2M2O2M2M4M2N4L5I7I8FV_a3"}}, {"score": 0.8228387832641602, "bbox": [260.6, 132.0, 54.2, 119.8], "category": "person", "mask": {"size": [640, 427], "counts": "^UZ55cc0c0A>C;E9H6J4L4M2M3M200O2N101N10001O1O00010O01N2O0O3N1M4L3L5G:G8I9FB:F6K3N2M200O10O2N102L3N3L4K6F:D=C?^OQSW2"}}, {"score": 0.16015304625034332, "bbox": [245.1, 147.0, 49.1, 96.7], "category": "person", "mask": {"size": [640, 427], "counts": "`mS5j0Sc08J4K4M100O1O100000000001O0O2M3L4K6G;Ch^j2"}}, {"score": 
0.15477655827999115, "bbox": [282.2, 3.8, 47.7, 69.7], "category": "person", "mask": {"size": [640, 427], "counts": "ndd5d0Xc09G7K4L3N3L3N1O2N100O2O0O1000000000000000O101O0O2O0O2N2N3M2M4L4K6I:CY[P2"}}, {"score": 0.12815631926059723, "bbox": [186.0, 89.3, 23.4, 113.3], "category": "person", "mask": {"size": [640, 427], "counts": "jSh3c0Uc0o0SO>D8J3N1O02O1L5K7Ba0WOoX[4"}}, {"score": 0.1271403282880783, "bbox": [266.2, 43.2, 65.2, 182.2], "category": "person", "mask": {"size": [640, 427], "counts": "Tk]5j0mb0b0Bb0_O8I7J;D:E6K3N3M2N3M2N010O2O00O101N2O001O1N2O1N3K4M3M4L3L4L5L4M3L4L4L5L3L5J6I8I:G>TOmaQ2"}}, {"score": 0.1255849301815033, "bbox": [242.3, 0.0, 95.3, 147.2], "category": "person", "mask": {"size": [640, 427], "counts": "T[Q57Yc0c0F:G9I5K5L4L3L4L5L3N2M3L4L4L3M3N3L3O1O2N1O1O101O00000M40O1O1O1O1O1O1N2N2O1N2O1O1N2O1L4M3N2O1N2N2N2N2N2M3O1N2N2O1N2N2N2O1N2O1N2N2N2N7H4L5K5J7Igcl1"}}, {"score": 0.11803192645311356, "bbox": [58.3, 0.0, 258.4, 193.4], "category": "person", "mask": {"size": [640, 427], "counts": "PlZ8"}}, {"score": 0.11758081614971161, "bbox": [40.2, 100.1, 125.8, 147.1], "category": "person", "mask": {"size": [640, 427], "counts": "cTg13lc02N2M2M3O10N2L4M3O100OfW\\6"}}, {"score": 0.11281603574752808, "bbox": [251.3, 150.6, 28.1, 69.8], "category": "person", "mask": {"size": [640, 427], "counts": "amS57cc0>F4M2N02N3M3FlnQ3"}}]}, {"image_id": 75283, "dets": [{"score": 0.9287164211273193, "bbox": [554.4, 77.7, 75.9, 135.4], "category": "person", "mask": {"size": [427, 640], "counts": "iQZ7i0\\<>D8I7I6K5J6K4K5L4K5L4L4L4M3M2N2N2O2M2O1N2O000O101O01O0000001O0010O0100O2N2O1N3M2N3M2O1N2N1O2N1O2N1O2M2O1O2M2O2M3M3M2N4L4J6J8G9D^f5"}}, {"score": 0.8432209491729736, "bbox": [339.8, 0.9, 172.6, 191.1], "category": "person", "mask": {"size": [427, 640], "counts": 
"ege4f0_C8H9H6K4L5J5L4L4K5L4K5K5K4M3M3M2N2O1N2O1N3N1N2O2M3N1O2M2O1O2M2O1O1N2O1O1N3O1N1O2N2N1O2N1O1O1O1O1O1O1O1O1O1O1O1O1O1O001O2N1O1O2N2O0O2N1O1O1O1O1O1O1O1O001O1O00100O1O00100O1O100O001O100O1O100O1O00100O1O00100O0010O01O1O010O1O1O100O10O01O100O001O10O00010O0000010O0000010O00000010O000000010O000010O0001O01O01O00010O001O01O01O0010O0001O0001O01O0000001O01O01O0000001O0001O0001O000000000000000000000001O0000000O100000000000000001O0000000000000000001O00000000000000000000000000000O100000O1000000000O1000000000O11O000000000O100000000000000000000000000000000O100000O1000000000000000O01000000000000000000O1000001O00000000000O10000000001O000000000O100000001O0O10000000000O2O0000000O101O0O1O101N100O101N100O101N1O101N100O101N1O2O1N1O2N2O1N2N2N2M3N2N2M3M3M3M3M3M3M3N2M3N10000000000000000000XI"}}, {"score": 0.6425513029098511, "bbox": [277.9, 10.7, 84.9, 163.3], "category": "person", "mask": {"size": [427, 640], "counts": "Wej34`<_1SOi0ROh0_O8I9G6J4M2N3M3M1O2N2O0O2O1N2K5O001O001O000000O10000O100O10000O100O1O1O1O1O1N2O1N2M3M3L8H5K5K5J6C=H8H8J7G:D=Ed0VOgdg3"}}, {"score": 0.6058015823364258, "bbox": [19.4, 23.3, 486.2, 359.3], "category": "car", "mask": {"size": [427, 640], "counts": "fo7Z9Q400000000000000000000001N100O1O2O0O1O101N100O101N10000O101N100O10001N10009G01O0000000000001O00000000000000001O00000000000000O10000000000O100000000O100000000O100000000O10000000000O100000000OH900O1000O01000000O10O100000O1000000O10000O1000000O10000O2O000O10000O10000O10000O1000000O10O100000O10000O1000000O100000O010000O2O00001N10001N10001O0O101O0O2O00001N101O0O2O001N1000001N1000001N1000001N10000O101O0O10000O2O000O101N100O2O0O2O0O2O001N101N101N1O2O0O2O0O2N2O0O2N1O2O0O2O0O2O0O2O0O2O1N2O0O2O1N2O1N101N2O0O2N101N1O2O0O2N2N2O0O2N2N2O1N2N2N3M3M2N3M3M3M2N3M3L4M2N3M2N2M3]LiGo1Z8mMjGP2X8mMkG[1NkMY8h0lGEGf0n7@UH_Ob0Diag4"}}, {"score": 0.2165617197751999, "bbox": [352.9, 5.1, 77.0, 179.3], "category": "person", "mask": {"size": [427, 640], "counts": 
"Udf4X1h;k0WO=D;I7H7I7K4L5L2M4K5K4L5L3N3M2M3N2M3O1O1N2O100O1O010O1K5000O10000O100O1O1O1O1N2O1N2N2L4M3L4K5K5K5M3L8E9F9G9J6I8H7F=G=_OZlj2"}}, {"score": 0.21267901360988617, "bbox": [324.6, 21.3, 159.0, 339.7], "category": "person", "mask": {"size": [427, 640], "counts": "gY^4l0Y<>B;F9G8J6J6J4J7DS>e0C9K5J5K5J6L4K5K4J7F9I7K6L3L5J4K6H7L5K4L5K5M2N3M3L4L4L4M3N2N2N1O2N2N2N200O001O100O1O1000O0100O0100O010O10O100J5100001O00000000000000001O000000O106J00001O001M2N3M2N3N1N3N1E5K7J5K4L4M2N3M2N101N2N2O1N2O0O2O1N2O1N2O001O0O2O001O001N101O00001O00001N10001O00001O001O00000O2O0000000000001O01O0000000000000O10001O000000000O2O00000000001N10001N10001N100O2O0O2N100O2N2O0O2N1O2N2N2N2N2N2N2M4M2M4L3M4K:E[Sf6"}}, {"score": 0.27915719151496887, "bbox": [587.6, 165.8, 52.4, 150.1], "category": "car", "mask": {"size": [480, 640], "counts": "cTg8e0n=e0@=G9G6K6I8J4K5M2M3L4M4K4M3M3N2M3M2N2N3N1N201N1O101N10000O10000000O101N2O0O101N1O2M4M1O`J"}}, {"score": 0.17030727863311768, "bbox": [137.5, 181.0, 53.8, 75.3], "category": "person", "mask": {"size": [480, 640], "counts": "[QV2a0[>:H5K5L4L3M3M2N2O1O1N2O001O001O0000000100O1O1O1O1N2O2N2M2N4L3L6H:B`Pf6"}}, {"score": 0.15209877490997314, "bbox": [183.0, 149.0, 283.9, 215.9], "category": "car", "mask": {"size": [480, 640], "counts": "^_a42m>4L4L5K:F?A3L4M3XC`Nk;b1PDeNm;]1mCkN0@a;Z3VDoLR;a3O1O1N2O001N101O0O101^EnKT:S4gERLX:[4O2O001N2O001O1N2O001O1O1O1O1O1O1O1O001O1O001O1O001O0O2O1O001O1O1O1O001O2N1O1O1O001O00001O00001O00001O00010O001O00001O000010O01O001O001O0001O00001O00010O00001O00001O00000O2O00000O2O00001N10001O0O2O1O0O2O1O1N2O1O1N2O001N2O1N2N2O1N1O2M3M3N2M5K5K6J4M4aL\\EZ2g:dM^EX2b:hM^EX2b:hM^EX2b:hM^EX2b:hM^EX2b:hM^EX2b:hM^EX2Xfa2"}}, {"score": 0.13292182981967926, "bbox": [319.2, 162.2, 147.7, 125.6], "category": "car", "mask": {"size": [480, 640], "counts": 
"jfT58e>:F;E7K8H8H4L4M8G2N2O1N2O1O1N3N1O1N2O1O1N101O1O1O1O0O2O001O001O1O0O2O001L301O00001O0000001O00001O001O00001O03NO001O010O00000010O0000010O000001O01O0000010O001O001O001O1O001O001O001O1O1O2N1N3N2N3M2M3N2M4M3L4L4L5J8H8G:D>WOjA2kSd2"}}, {"score": 0.12956960499286652, "bbox": [4.3, 140.2, 177.5, 140.3], "category": "car", "mask": {"size": [480, 640], "counts": "cP2g3Y;0000000000000O100000000000O100000000O1000000O1000000O100O10000O1O100O100O2O000O100O100O100O101N100O1000000O101O00000O101O0O101O0O101N101N10001N101O0O2O000O2O001N101N2O1N2O1O1N2O1N2O1O1O1O1O002N1O1O1O1O1N101O1O1N2O1O1O001N101O1O001O001N101O1N2O0O2O1O1O1N2O1O1O0O3N1O1O1N2O2N7G3M4L5LU[W7"}}, {"score": 0.09589684754610062, "bbox": [2.6, 148.3, 148.0, 77.0], "category": "car", "mask": {"size": [480, 640], "counts": "jR1n1R=0000000O2O00000O100000000000000000000000O100000O100000O2O000O100O100000000O1000000O100O2O00000O10000O101O0O10000O101O000O2O000O101O000O2O000O2O001N100O2O001O1O1O1N101O1O0O2O00001O0O4M1N2N1O2N2N2N3M1O2N2O2L[Tj7"}}, {"score": 0.09131783992052078, "bbox": [65.5, 164.2, 142.1, 96.6], "category": "car", "mask": {"size": [480, 640], "counts": "mkR15i>9H2N2L4N1O7J2N1N10O10O1000000O1000001O001O000000000000000001O1N10001O0O2O001O1N101N10000O2O000O7J0O2H`AMb>27O00Zf4KTRO1eWL2GNbA2[>2cA0Z>3eANX>4hAMW>3iAMW>4gAMZ>2fANZ><1O00010O1O1O000000001O00001O001O1O1N2Ifjh6"}}, {"score": 0.0856514424085617, "bbox": [136.9, 181.5, 40.4, 37.9], "category": "person", "mask": {"size": [480, 640], "counts": "RQV29f>3M2N2O0O2O00001O01O002N1O1N2O4Kiom6"}}, {"score": 0.08439887315034866, "bbox": [60.1, 218.7, 102.0, 60.9], "category": "suitcase", "mask": {"size": [480, 640], "counts": "e`Q1<^>:J4L3N3M1O2O1N101N101N100O100000001O0O1000000000000O10000000000000000000000000000O0100000000000000000000000000O101O00000000000O101O0O10001O0O101N101N1O2O1M3N2M4K9EcbS7"}}, {"score": 0.0838632583618164, "bbox": [0.0, 221.4, 93.9, 195.6], "category": "car", "mask": {"size": [480, 640], "counts": 
"W8\\4d:000N3N2N2O1O0000001O000O100O10O100O1O1N200O10000O1O2N100O1N2O2M2N2N2N3N1N3M2N2N2O1M3M3M2N3\\Od0I7YOf0N3N2O1N1O001O1N3N101O1O1O1N2N2N3J5M4N7Hm\\[8"}}, {"score": 0.07902027666568756, "bbox": [412.8, 218.8, 27.4, 31.7], "category": "person", "mask": {"size": [480, 640], "counts": "[^W61m>5M1OO2N2MggQ3"}}, {"score": 0.07822771370410919, "bbox": [433.6, 220.8, 21.8, 30.5], "category": "person", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.07487121224403381, "bbox": [463.0, 172.1, 71.6, 63.6], "category": "bench", "mask": {"size": [480, 640], "counts": "Vkn65e>f0C9H6K3N2M4M2N2O0O2N101N101O001N10000000000O1000000000000000000000000000000000000000000000O1000000000001O00000O10000O2O0O101O0O2O001N101N2N2O1M4M4K5L5H=YOcXe4"}}]}, {"image_id": 79966, "dets": [{"score": 0.9860239028930664, "bbox": [5.2, 234.4, 429.7, 403.1], "category": "bicycle", "mask": {"size": [640, 480], "counts": "h`3h6X=0000000000000000000O1N3N1O2N1O2N;E1O1O1O1O100O1O100O1O100O1O100O10000O100O1O100O1O1O1O100O1O1O100O1O100O100O100O100O100O1O100O100O1O100O1O100O100O10000000000000000000000000000000000000000000000O100O10000O100O10000O10000O10000O1000000O10000000000000000000000000000000000000000001O0000001O000000001O000F:O10001N10000O10001O000O1000000000000O10001O00000O100000000O2O00010O000010O0001O00010O001O001O01O01O001O0000001O00000010O00000001O00000000000010O0000000000000000000000000000000001O00000000000001O:F000001O000000000000001O00000000001O000000001O000000001O000000001O00000000001O000000001O0000001O000000001O00000000001O00001O00001O001O001O00001O00001O00001O00001O00001O001O001O001O001O001O001O001O001O001O001O001O001O1O001O1O001O1O001O1O001O1O1O1O001O1O1O1O1O1O1O1O1O1O1O1O1O2N1O1O1O2N1E;N3N2M2N2O2M2N2N2M3N3M2N3M3L4M3L4M3L5K5K5J6K4L5K5J7J20000000000000000000eXl0"}}, {"score": 0.6567422151565552, "bbox": [183.8, 164.6, 195.9, 270.7], "category": "dog", "mask": {"size": [640, 480], "counts": 
"[^n3>[c0a0C8H8I6J5K5L3M4M2M3M3M3N2M3M3N2N2M3N1O2M3N2N2M2O2N2N2M3N2N2N2M4M2N2N2N3L3N3M3M3F:M3M3M3M2N3M3M3M2N2N4L4L5K3M2N1O25JO2N1O2N101N2N1O100O2O0O1O1O101N10000O1000O10000O2O0O2N1O2O0O1O2N1O1O101N3M2N2N2N2N3M1O2N2N2N2O1N2O1N2N100O1O101N1O1O2N2O1N2N2N2N101M2O2N1O1O1O1O1O2M2O1O1O2N1O2N2N1O1O2N1N3M2N3M3M2M4M2N3L3N4K6K5I7J8F:F;E;Da0_O\\UX2"}}, {"score": 0.12645061314105988, "bbox": [143.1, 199.8, 166.8, 403.7], "category": "person", "mask": {"size": [640, 480], "counts": "bTd34jc06I6XNCd_Od0ja01N3M4J7B@k_i3"}}, {"score": 0.06935372203588486, "bbox": [31.9, 527.0, 150.7, 113.0], "category": "bicycle", "mask": {"size": [640, 480], "counts": "ie[1C9G6J5K6K5L4K4M3M4L3M3N1N2N101O1O001O1O000O1000001N1001O10O0100N2O1N2O1M3N101N3M3M2N3M4L6J6J3L4K5L3L?A8Ed0YOc0nN[YT3"}}, {"score": 0.06091064587235451, "bbox": [280.1, 273.7, 78.5, 181.8], "category": "handbag", "mask": {"size": [640, 480], "counts": "[jf5W1`b0a0@=H=C9F9J4L4K6K5K2O2M4M2M3N1O1N2O2N001O1N100O10001O1O01O001O1O1O1O2M2O1N2N3N2M1O1N2O1N5M4L3L3N1M4K5L;E4J7H?_O>XO^aa2"}}, {"score": 0.059182677417993546, "bbox": [283.5, 253.6, 115.4, 129.3], "category": "handbag", "mask": {"size": [640, 480], "counts": "PYS61kc08Z\\OLVc0c0N1O00000O2O1N2N1O2O1N1O2N101N2O1N1]]OjNZb0_1O001O2N2N1O1O2N2N1O2N1O10O01O100O00001O00001O001O1O1O1O0001O001O001O1O00100O1O0O2O1O1N2N2O1N2N3M2N3M2N3L5K5J;E7@Pbh1"}}, {"score": 0.05876724049448967, "bbox": [196.7, 113.0, 256.8, 527.0], "category": "bicycle", "mask": {"size": [640, 480], "counts": "]Vj4a0ZAJW>j0[A_OU>e1n@cNi>_3L5J5J5K3M4L3M4L3K6DN101O0O3O0O2N1=DN1O1O1O1O1O1O1O1O1O2N1O2N2N3M2N3M4L2@a0M1N3M2N2M4M2M4L4L4L4L5K5J5K6K4L4L5K5J6K6H9G8H7G8I7J5L4L6I8G=B?A>cKdA_2c`0YO?@>Bb0_OXin0"}}, {"score": 0.058172035962343216, "bbox": [351.2, 213.4, 126.3, 152.2], "category": "bicycle", "mask": {"size": [640, 480], "counts": "lTg74kc05K5L=C0O2O1O1N2O0O2N3M101O1N11O0010O01O010O0000000000001O01O000001O00000000010ON2O2O1N10001O00O2E:N3M3M3N2N101N:F3M3M3Mef>"}}, {"score": 0.05350148305296898, "bbox": [10.3, 455.0, 
141.2, 182.7], "category": "bicycle", "mask": {"size": [640, 480], "counts": "i`:;cc06J4J6E;M1O2N2O2M2O2N3L1O1O010O100O1000000000000000000001O00001O00O1000000O10O2N2N2N3L3O1N2N2N1O2N1O2N1O1O2N5K4M3L3N2L4L3N1:k\\ODQb0a0h]ODUb0>h]OEWb0=d]OGZb0Q1N1O2M3O0h]OcNla0l1M2N2N1O3M3M2M2O1O1O1O1N4M2N1O1O1O001O0010O010O01O10O1O001O001O00001N101N101O0O3N1N2N3M2N2N2M3N4K7I6I6G:Eb0^O=Eleb6"}}, {"score": 0.051157448440790176, "bbox": [192.5, 378.4, 173.5, 89.0], "category": "handbag", "mask": {"size": [640, 480], "counts": "TQX41nc03Z\\O0Wc06c\\OMZc0>N1O2M2O2M2O2N1O2N1O1O2O0O101O0O101O001N2O001N100O2N100O101O0000000000000000001O00000000001O00001O0O1000001O0000O10000O101O000O11O0000001O0000000000000001O000000001O001O0001O0001O000O2O1O001O0000001O000O2O001O001O00001N2N2N2O001N2O001N1O2N2N1O2L4N102L5Ilfb2"}}, {"score": 0.04883522167801857, "bbox": [17.2, 358.6, 106.8, 147.7], "category": "bicycle", "mask": {"size": [640, 480], "counts": "aU>e0Vc09I9G7J5K3M4K4M3M2O2M2O2N100O2N1O1O1O10O001O1O001O1000O1000000001O00000O1O1O001O2O00000000001N100O100O2O0O2O1N3Am]OmNUb0_10O01000Fm]OfNSb0Y1o]OdNTb0Z1;N3M2N2M200O1O2OQ]OVOgb0Q13L6D?F8GSf]7"}}]}, {"image_id": 98194, "dets": [{"score": 0.97929847240448, "bbox": [482.1, 203.1, 99.6, 142.3], "category": "cat", "mask": {"size": [427, 640], "counts": "on\\6f0]U;8E9K5L3N2N2N2M2N3N2O0O100O2N1O1O2N100010O0001O00000000000001O001O101N1O001O001O000010O0001O00010O0000000001O00000000001N100000000O2O00001O000O10001N100O101N1O1O2O0O1O2N100O2O0O101N101O000O2O001O0O101O00001O00000000000000000001O000000001N101O001N10001N2O1N3M2N3N1N2N100000PK"}}, {"score": 0.9878488779067993, "bbox": [391.2, 80.7, 65.8, 130.1], "category": "person", "mask": {"size": [375, 500], "counts": "Ycb49Z;;F6K5J5M3L3N6I6I6K5L3I6M4M3M2N100O10001O1O1O1O1O00O2N1012N1O02M3L4L3N3L2O2M2L4L5J5K6L4L4DU;DVWl4"}}, {"score": 0.7932952046394348, "bbox": [6.3, 24.8, 242.8, 122.6], "category": "bus", "mask": {"size": [375, 500], "counts": 
"lU5:Y;;EB>B7J5J4L6K;D`0@9G4L4K5L4M3L4M1O0010O100010O01O1N2O3M3L4M4K5J5I6J7K5I7G9D?\\Oe0lNQ1YO\\Tl2"}}, {"score": 0.4870213568210602, "bbox": [60.2, 115.2, 81.6, 58.8], "category": "car", "mask": {"size": [375, 500], "counts": "\\eh0a0S;6K5K3N2N2N2N2N100O2O001N1000001O0O10001O00000000000O101O0000000001O000000000000000001O01N100000001O00001O00001O1O001N2O1O1N2O1N2N3M3M3L6IbiT4"}}, {"score": 0.447935551404953, "bbox": [308.6, 98.3, 143.3, 67.3], "category": "car", "mask": {"size": [375, 500], "counts": "lVd32_;8M6I3N2M3N2N2M3N1O2N1O100O101O0O100O1000000O1000000O10000000000O10000000000000000000000O1000000000O100000000O1000000O100000000000000000010O0001O1OO100O2O00000000001O00000000001O000O2O001N2O1O1N3N1O1N2O4L1N1O2O1FkDOY;LjD3^;OPoj0"}}, {"score": 0.3413696885108948, "bbox": [170.0, 91.9, 55.0, 172.5], "category": "person", "mask": {"size": [375, 500], "counts": "SoP2n0`:b0B]OhGcM]8o1m0I9H9G:Ff0[O7FT^Z3"}}, {"score": 0.32846277952194214, "bbox": [133.7, 154.8, 70.4, 113.7], "category": "motorcycle", "mask": {"size": [375, 500], "counts": "djc13X;o0ZO7I7J6J4L4L4M3M3N3L2O1O1N2O1O010O001O1O000010O1O1O1O2N2N1O2N1O2N2M2O2N2M2O2M2N4M1N8I2L4J7Bb0Ciea3"}}, {"score": 0.28892782330513, "bbox": [122.0, 108.6, 58.1, 48.3], "category": "car", "mask": {"size": [375, 500], "counts": "ko^1`0T;6K4M2N2N1O2O000O2O00000O100000001O0000000000000000000000000000O2O000000001O0O101O1N2O1N3L5J:CPaf3"}}]}, {"image_id": 145544, "dets": [{"score": 0.9024617075920105, "bbox": [164.9, 0.0, 380.6, 420.4], "category": "toilet", "mask": {"size": [426, 640], "counts": 
"[cW3a0XB8H5K8H9G1O2N1O2N1O1O2N1O1O1O1O1O1O1O1O1O1O002N1O1O1O1O1O1O1O1O1O001O1O001O001O1O001O1O1O001O1O;E0000000000000000E;0000O10000O10000O10000O10000O100O100O100O10000O10000O10000O10000O100O100O1O100O1O1O100O1O1O1N2O1O1O1N2O1N2O1N2N2N2N2N2M3L4M3M3M3M3YOg0N2N2N2O1N2O1N2O1N2O1N2O1O1K5J6I7J6M3L4L4L4L4L4L4M3L4L4M3N2N2M3N2N2M3M3XOWJmIk5P6i0N2N2N2O1N2O1N2O1N2N2M3M3N2M3N2N2O1N2O1N2O1O1N2O1O1N2O1N2O1O1N2O1N2O1N2O1N2O1N2O1N2N2O1N2N2N2N2N2N2N2N2N2N2M3N2N2N2M3N2N2N2N2N2O1N2N2N2N2M3L`0@8H9F:AVU^1"}}, {"score": 0.5214170813560486, "bbox": [321.9, 148.8, 312.6, 277.2], "category": "cat", "mask": {"size": [426, 640], "counts": "YPf44S=5J5L5L3M4M3L4M3L4L4L4M2N3M2N2N2N2N3M3N1N2N2O1N2O1N101N101N101N2O1N101N2O1N101O1N101O1O0O2O1O0O2O001N101O001N2O001N101O001N101O001O1O0O2O001O1O001O1N101O1O1O1N2O1N2O1N2O2N1N2O1N2O2N1N2O1O1O1N2O1N2O1O1N101O1N2O1O1N2O002M2O1N2O1O0O2O1N2O0O2O1O0O2O1N101O2M2O1N3N1N2O1N3N1N2O2N1N2O1N2O1N2O001N2O1N101O1N2O001N2O001N101O1N101O1O0O2O1O001N101O1O0O2O001O001O0O101O001O001O0O2O001O1O0000001O000O101O000000001O0000000O2O0000000000000000000O10000000000000000000000000O1000000000000000000001O0000000000007I0000000000I7000000O1000000000000O100000000000000000nZ2"}}, {"score": 0.2799168527126312, "bbox": [272.5, 257.8, 196.0, 123.8], "category": "toilet", "mask": {"size": [426, 640], "counts": "W\\d31P=?]Ob0E9J6K4M3L3N2N2N2N3N1N2N2O1N2O1O1O1O1N101O1O1O1O001O1O001O1O001O1O001O010O00001O000010O00010O000010O01O001O01O01O001O0010O01O001O00001O0010O0001O001O0000001O0000010O0000001O0000001O00000000000000000000000O10000010O000000000000000000O1000000000O100000O10000000000000000O2O0000000000000O101O000O100O2O000O100O2O0O101O0O101N100O2N101N2N2N1O2N2N2N2N3L4M2M4K6J7IWbZ2"}}, {"score": 0.27277666330337524, "bbox": [283.2, 140.7, 198.1, 249.5], "category": "toilet", "mask": {"size": [426, 640], "counts": 
"dgk3Y2m:c0_O8H9G;E5K4L5L4L4L4L3M2O2M3M3N2M3M3N2M2O2N2M3N2N1N4M2N2M5L4L3M3M2M3N2N2N2N2M4M2N100O1O100O7JO00001O001O001O001O001O1O001O1O1O1O1O001O1O1O2N3M2N2N2N2N2N3M2N3M3M2N3M2N3M=C9G9G7I3M2N2N1O2N1O3M5K7I6J6J2G8M3N2M3M4L3M3M4L5L5I7J6J8FCfS="}}, {"score": 0.0623580664396286, "bbox": [360.9, 130.7, 20.4, 35.9], "category": "bottle", "mask": {"size": [426, 640], "counts": "bog4?h<5L3N1O2N1001N2O1O3K6IgX]3"}}, {"score": 0.05497661232948303, "bbox": [333.9, 0.0, 230.4, 155.1], "category": "toilet", "mask": {"size": [426, 640], "counts": "o^]4>f<8L5J5M3L4M2N3M2N2J6O001O001O001O001O00001O001O001O001O00001O001O0000001O0000001O00001O00001O00001O00001O001O001O001O1O1O001O001O001O001O001O001O2N2N1O2N1O1O3M4dEWMo9U3M4L4L3M2N2N1O2N1O1O2N1O1O2N1O001O1O001O001O0000001O00000000001O000000001O0000000000001O0000000000000000000000000000000000000000000000000000O5L001O0O101N10001N100O100O100O2O000O101N1O2N2N1N3N2N2N2N2N1O2N2O1M3N2N2N2N2N2M3N3K4J6M4K4SOlD_OT<@SD:c4K5L4M@TB=Z>M2N2N2N1OSna1"}}, {"score": 0.498581200838089, "bbox": [157.1, 242.3, 105.8, 135.0], "category": "suitcase", "mask": {"size": [480, 640], "counts": "bb^2f0k=n0[O1N3L5J7G6N1N3N1O100O2N1O100O100O2O00001N10001O001N10001O001O001O001O001O001N10001O00001O01O0000000000001O0000001O001O0000001O01O0001O00000000000000O10000O100O101N100O10001O0O101O001N2O2N1O00001N10000000000000O10O10000000O100O10000O0100000O10000000O1000000000000000000001O0000000000000000000000001O00000000000000000000O1000000O10000000000O1000O100000O1000000000000O1000010O1O1O010O000010O0000001O00001O00001O01O01O00001O00001OO1000000000001O00000O100000000000000O1000001O0000000O10000000O1000000O101O000O100O10000O1000000O1000001O0O100000O100O2O0O1000001O0O101O00001O001O1O001N101O00001O00001O00000000O100000000000000O1000001O00000000000O1000000000000000000O100000000000000O10000000000O1000000O10000O1000000O11O000000000000000001O00000000000000001O000000001O00001O1O1O5K7I0O2O00001O000O2O00001N10001O0O101N1_OSBOm=0VBMl=1WBMi=2ZBKg=4[BJg=4\\BJd=5^BId=6c0O2N2N\\_R1"}}, 
{"score": 0.23101481795310974, "bbox": [231.0, 130.2, 220.6, 231.0], "category": "person", "mask": {"size": [480, 640], "counts": "QUP41h>:I8K5L3L3M3N1N2O1N2N2N2N2N2M3M3N2N2O001O0O2O1O001N2O1O001OO100O2N1O101N10001N100000O2O0000001N10001O1OM4M3L4M3M3B?N2M4L3M3M2N2N3M2N4LSo[4"}}, {"score": 0.21316315233707428, "bbox": [22.2, 138.8, 247.3, 178.7], "category": "bench", "mask": {"size": [480, 640], "counts": "b_:f0Z>000000000N4J9H6L5K6K4K4L3M3M3M3M4L4L2O0O2O0O2O0O2O000O2O0O2O000O100O101O0O10000O1000000000000O100000000000000O1000000000001O000000001N10O10000000O1000O1000O100000O10000O10001N1000000O1000000O10000O10000O1000000O10000000000000000O1000000O2N100O1O100O1O2N100O1000001O000000001O000O101N102N1N2O1O1N2O1O001O2N2N3L4L2N3D;N2M3N2N2M`dd6"}}, {"score": 0.1879701018333435, "bbox": [155.9, 225.8, 92.1, 57.6], "category": "suitcase", "mask": {"size": [480, 640], "counts": "dS]2?^>7J5K4N1N2N2O1N101N2O0O2O00001N100000001O000000000O10000000001O0000000000000000000000000000O2O00000000000000001O000000000O2O001O0O101N101N2O0O2N2N2N4K5J7G[Zj5"}}, {"score": 0.18139855563640594, "bbox": [79.8, 184.4, 176.2, 191.9], "category": "bench", "mask": {"size": [480, 640], "counts": "R]Z1h0U>7I6K3N3L3N2N2N3M3M3M2N2N2N2N101N2N1O2N101O1N2O001N2O001O1N2O1N2O1O1N2O0O2O1O1O002N1O1N2O2N1O0000001O0000000O20N1O100O10000O10000001O1O100O1O1O1O10O001O000010O0010O0N3L301N2O0O2O1O1O1O1O1O1O1O1O1O2O1M3M4L3M3M4L3M3M3L3M5K6^Ob0SOUB:^>ImS\\6"}}, {"score": 0.1567327380180359, "bbox": [376.5, 213.8, 210.4, 125.7], "category": "bench", "mask": {"size": [480, 640], "counts": "foe66i>3L3M201N2N3N2M3N1O2M3N4L2M2O2M2O1O2M2O001O001N101O00001N10001O00001O00001O000O101O00001O001O0O2O00001O00000000O100000000000O10000000O10000000000O2O00000000001O0O100000001N10000O10000O101N1O101N1O1O2N1O2N1O2N2M2N3M3M3M4K6G:IYQQ1"}}, {"score": 0.14814870059490204, "bbox": [185.9, 279.2, 70.5, 86.9], "category": "suitcase", "mask": {"size": [480, 640], "counts": 
"ehj2o0k=;H6K4L4M3M2M3O0O2N1O2O001N10000O2O000000000O10000000000000000001O000000001O000O2O000O2O001N2N2O1N2N3M2M5K4K9F`0YOZ`f5"}}, {"score": 0.13428199291229248, "bbox": [67.4, 162.1, 390.2, 129.2], "category": "bench", "mask": {"size": [480, 640], "counts": "UfU11n>2N2N2_AJT>:gAIX>a0N1O2N1O2N1O2H8N101O1N101O001O001O0O2O001O00001O0O2O001O00001N10000000000000001N100000000000000000000O1000001O0000001O000000001O000000001O00000000O100000000000000O100000000O10000O10000000000O101O00000O100000000O101O00000O10001O00000000001O000000001O0000001O0O1000000000001N1000000O2O001N101N2O0O2N2N2O0O2N1O2N1O2N2N1O2O1N2N6K3L3Mgcf5"}}, {"score": 0.10888918489217758, "bbox": [319.1, 292.3, 70.0, 53.2], "category": "backpack", "mask": {"size": [480, 640], "counts": "nQj4;a>7K4L3N2M3N2O0O2N1O100O101O0O10000O10001O000O1000000000000000000000O10001O0000001O0O101O0O2O1N2O1N2N2N3L5K9BWUh3"}}, {"score": 0.10317633301019669, "bbox": [158.8, 211.6, 407.2, 155.0], "category": "bench", "mask": {"size": [480, 640], "counts": "PP\\9"}}, {"score": 0.09948824346065521, "bbox": [320.3, 153.6, 224.9, 93.9], "category": "person", "mask": {"size": [480, 640], "counts": "cao4:_>8M4M3M2N2N2O1N101N101O0O2O001O001N10001O000O2O0000001N10001O0000001O0O10001O0000001O0000001O0000001O00000O101O000000000000001O01O0000000000000001O00000000000000000000000001O0001O0001O0001O01O000000001O01O0001O00001O00001O00001O001O00001O001N101O1O001O1O1O1O1O1O1O1L5L3N2O1N3N1N2O1N2O1M2M6K4Kn`W2"}}, {"score": 0.0935162827372551, "bbox": [287.3, 143.5, 175.8, 144.6], "category": "person", "mask": {"size": [480, 640], "counts": "\\c_48d>8I4M4L3N3N1N3M3N1N3N0O2O1O001O1O001O1O001O1N2O1O1O0O2O1O001O001O001O1O001O01O01O010O1O010O000001O0000001O000O101O001O001O1O1N2O1O0O2N2N101M4M2N3L3N4ARBDR>7iYh3"}}, {"score": 0.09293948113918304, "bbox": [49.5, 179.0, 127.4, 115.6], "category": "bench", "mask": {"size": [480, 640], "counts": 
"gZl0=X>a0H5L8G6K3M3M2N2O1N101N3N3L3N1N2O001N100O100O1000001N101O00000O100000000O010000O10000O0100000O1000000O100O10O11N100000000000000O100O10001N1000001N10000O2N1O1O1O101N1O1O2N1O1O2N1O1O2N1O2N1O2N2M3N2M4L4J6L5K9E`mn6"}}]}, {"image_id": 153692, "dets": [{"score": 0.9953930377960205, "bbox": [237.0, 255.9, 164.6, 119.3], "category": "bicycle", "mask": {"size": [425, 640], "counts": "nWW3a0d<6K4M3M4L4L4M2M3N2N1O1O2N1O1O1O100O100O010O01O010O01000O100O1000O0101N1O100O2N1O100O1O1O1O2M2O1O1O1O2N100O2O0O2O0O2O001O0O2O000O101O00000000000000000000O10000O100O2O001O001O1O1O1O1O1N2O2M2O1N2N2N2N2N2O1O1O1O1O2N1O0010O01O010O1O010O1O00001O01O01O0001O1O1O1O001O001O0O2O2N1N2O1N2N2N3L4K7J7G9ERiU3"}}, {"score": 0.9942272901535034, "bbox": [284.9, 188.5, 93.9, 166.6], "category": "person", "mask": {"size": [425, 640], "counts": "Zjn36Q=6I5K4M4L3M3M3N2M3N4K5L4K5J5L3L5K4M4L4M3M2N2N1O2O0O101O0000001N11O00001O00001O1O001O1N4M2M4M2M3K4K6K4N3L4L3M4L4M2N2M4L4L4L6I8GiW`3"}}, {"score": 0.228755384683609, "bbox": [351.1, 111.4, 16.5, 35.7], "category": "traffic light", "mask": {"size": [425, 640], "counts": "`Qc47l<>F4L3N100001O1N3L8CePb3"}}, {"score": 0.2218824326992035, "bbox": [262.9, 100.6, 19.4, 38.8], "category": "traffic light", "mask": {"size": [425, 640], "counts": "Y`^3c0c<7J3N2N10000001O001N3M4IVge4"}}, {"score": 0.21566087007522583, "bbox": [327.3, 295.4, 77.5, 69.1], "category": "bicycle", "mask": {"size": [425, 640], "counts": "SX^4=i<7J5L2N3M3N2N1N2O1O001N10001O00001O000000001O000000001O000000000010O0000001O0000001O0O2O1O1N2O2M2M4L5K7Hi[U3"}}, {"score": 0.20337550342082977, "bbox": [187.5, 92.1, 19.5, 39.8], "category": "traffic light", "mask": {"size": [425, 640], "counts": "T\\_26l<`0E4L3N2O0O11O001N3M4IjXe5"}}, {"score": 0.16149717569351196, "bbox": [83.4, 87.6, 18.1, 33.1], "category": "traffic light", "mask": {"size": [425, 640], "counts": "jcT1=i<7J3N101OO2O1N3LiXQ7"}}, {"score": 0.1509052962064743, "bbox": [102.3, 248.3, 41.6, 21.0], "category": "car", "mask": {"size": 
[425, 640], "counts": "_R]12U=4L3N101N10000O1000001O000O10000001O000O1000001O000O2O1N2OVS`6"}}, {"score": 0.14925630390644073, "bbox": [46.7, 251.9, 37.4, 21.1], "category": "car", "mask": {"size": [425, 640], "counts": "ije05S=2M3O1N1O1000001N100000000000000000O101O001O0O2O1N2MgbX7"}}, {"score": 0.1436997354030609, "bbox": [427.4, 123.9, 13.3, 25.4], "category": "traffic light", "mask": {"size": [425, 640], "counts": "[Pc5B?_O`0Bc0\\Od0]Oc0TOo0YO]d_2"}}, {"score": 0.8129173517227173, "bbox": [218.5, 67.8, 129.4, 290.2], "category": "refrigerator", "mask": {"size": [375, 500], "counts": "^Qg2i0T:YO\\F]1_8g1nNj0F8E9_Oa0G9I8H9F8E:ZOe0I8L4L3M4L3N3N2N1O2N1O2O0O2N1O2N1O2O001N100O2O0O9H0O1O100H80000000000000000000000000000000000000000000000000000000000000000000000000000000000000H8O1N2O1O9F3M3N1N3M3K5K5J6jNV1ZOf0C>E;D`0\\Od0[O[1WN_hi1"}}, {"score": 0.5368696451187134, "bbox": [145.7, 321.5, 156.4, 53.5], "category": "dog", "mask": {"size": [375, 500], "counts": "a_i14^;6I6M4M2K6N1N3N1O100O2N2N1O100O100O1000000O1000000O1000000000000000000O1000000000000000000000000000000000000000000001O000000000000001O00000000001O0000001O00000000001O0000001O00001O00001O00001O0000001O00001O001O001O00001O001O001O1O1O001O00001O1O1O1O1O001O1M3O1O1N2O1N3N2KVj[2"}}, {"score": 0.21352948248386383, "bbox": [219.0, 6.8, 267.4, 359.9], "category": "refrigerator", "mask": {"size": [375, 500], "counts": "VgP3`0n:g0^Od0_O;kKkMSN\\2a1lM\\NY2Z1oMcNU2T1TNgNQ2Q1XNXLPOh1l2e1aNSLSOR2`21_MdNR2W1c0GjMkNj1X1a0]OVNSOa1[1=oNfN^OS1_1;jNkNBn0b1;dN]2X1T4M3E:O2N1O1O1O1O1O1O:F0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000G9N2O1N2O1O1O1O1O1O1O1O1O100O1O100O100O1N2N2N2M3N2M300000000000lS5"}}, {"score": 0.1988125592470169, "bbox": [3.2, 136.9, 
32.6, 41.0], "category": "clock", "mask": {"size": [375, 500], "counts": "W[21];a0E6L2M4M2O0O2O00001N10000001O0O10001O1N2N2N2N3L6IhW[5"}}, {"score": 0.18266721069812775, "bbox": [87.8, 84.8, 106.2, 267.6], "category": "refrigerator", "mask": {"size": [375, 500], "counts": "jhT14;a0V:o0@8I4K7I8G8D=B01L300J72N100OM30011N1000O0100N2O1O3M3M2K6Jfgh3"}}, {"score": 0.11967019736766815, "bbox": [127.3, 133.7, 67.4, 46.3], "category": "dining table", "mask": {"size": [375, 500], "counts": "UWa15\\;;I4L4M2N2O0O2O1N10001N1000000O10000000000000000000000000000000000000O100000001N10000O101N100O2O1N1O2N3K6JfSb3"}}, {"score": 0.11463303118944168, "bbox": [132.6, 89.9, 66.6, 115.1], "category": "refrigerator", "mask": {"size": [375, 500], "counts": "Wfb13_;f0[O9H3N3K4M4L3M3N2N1L5M210C[NgFe1X9>ON2N1N4N1M3BcFcN]9\\1kF^NT9b1`0I7O2M2O2N101N2O1N2O10OO2N3N10000N2O2N1O1O2M2O3L4M2KVUb3"}}, {"score": 0.11355728656053543, "bbox": [0.0, 200.7, 78.3, 149.3], "category": "oven", "mask": {"size": [375, 500], "counts": "^7U2b90K9I;E7K2N3M2O1N3M10000000000O2O0OO2N1O101N3N1N2O1O1O100O0O1N2O1O2O1O001N1H9L3O2L4K6N1N3O0000001O1N2O1O1N2O1O1N2M3M4J6L3O2O103YO^EOi\\o4"}}, {"score": 0.10060609132051468, "bbox": [78.3, 48.6, 125.1, 189.0], "category": "refrigerator", "mask": {"size": [375, 500], "counts": "Umn06V;h0fN[1fNP1I6K;E5J3N3M3O1N2O1N1N3N2N2O1O2N1O1O1O000O100000001N100O2O0O101O0001O01O00001O001N101O0O1000000000000O100O01000O10000O0100000000O001O1O1O100O1O2N1O1kMdImN^6P1iIhN[6g0XIQNe0o0X6h0aJnNf5m0cJdNg5X1^2N2N2O2M2N2N3M2O2M2O2N1O1M4L3L6K5L5K4Lamb3"}}, {"score": 0.0880332812666893, "bbox": [282.8, 32.3, 28.9, 46.7], "category": "bottle", "mask": {"size": [375, 500], "counts": "\\iX36Y;?E6L3M2O2N1O100O1000000001O0O3N2M2N6H=AQdV2"}}, {"score": 0.08756314963102341, "bbox": [250.5, 114.3, 60.4, 53.2], "category": "book", "mask": {"size": [375, 500], "counts": "lWn2e0o:5L4L3N2N1O2O0O1O2O000O1000001O000O100000000000000000000000000001N100000001N101O1N101N2O2M4J9GPYW2"}}, {"score": 0.0858195498585701, 
"bbox": [42.4, 69.6, 219.1, 207.3], "category": "refrigerator", "mask": {"size": [375, 500], "counts": "Yga08Y;K3M2N2N1O2O0O10000O101O00000000000O100000000000000000000O100000000000000000000000O10000000001O0O10000O2O000O2N1O2N2L4J7Gnim1"}}, {"score": 0.08090058714151382, "bbox": [253.3, 90.5, 16.1, 33.0], "category": "person", "mask": {"size": [375, 500], "counts": "Ynn2i:S1YO8AB7H7^Oe0B[\\Q4"}}, {"score": 0.0696650892496109, "bbox": [219.8, 88.4, 27.7, 44.5], "category": "person", "mask": {"size": [375, 500], "counts": "dnc24b;5K3N100000N5KlVP3"}}, {"score": 0.0691106840968132, "bbox": [113.7, 70.9, 79.4, 94.0], "category": "refrigerator", "mask": {"size": [375, 500], "counts": "gmZ1P1d:n0TO4L4L2N2N3N001N2O00001O000000001N10O10000000000O1000000O10000O100O1O100O1O1O1O1N2N2N2N2J6N2M3N2N2N2N2O1O100N2O1O1O1O1O1O1O2N100O1O2M3M2O3L4K7E\\^a3"}}, {"score": 0.06785347312688828, "bbox": [326.9, 11.6, 19.2, 71.9], "category": "bottle", "mask": {"size": [375, 500], "counts": "mci3`0l:e0B8I5L21O0N4N5DcVj1"}}]}, {"image_id": 155997, "dets": [{"score": 0.9750598073005676, "bbox": [287.2, 133.6, 78.5, 158.2], "category": "person", "mask": {"size": [480, 640], "counts": "eZ\\4e0V>:G:F9H7J5H9G8K5K4M3L5L5K4M1N2O2M2O1O2M2O100O1O10O10O11O1O1O2N2O2M3M2M3N1O1O1O001N2O1O1N3N0O2O1N2N3L3N4L4K4L6J5I9G>_OinS4"}}, {"score": 0.965568482875824, "bbox": [231.9, 261.4, 197.2, 115.6], "category": "dog", "mask": {"size": [480, 640], "counts": "SXa3=]>>F7I6J5L3M3M2O2M2O1N3N1O1N2O001O1O0O2O001O1O001O001O001O000010O000001O00010O00001O01O0001O010O00010O00010O00010O01O1O010O000010O01O010O001O010O00001O010O00001O01O01O00001O01O01O000010O000000010O00000000001O01O0000000001O0000000000010O0000000000001O000001O000001O000001O00000001O01O000000001O0000001O0000001O00001O001O001O001O001O0O2O001O1N2O001N2O1O1N2O1N3M2O2M3M2N10000000jbR3"}}, {"score": 0.8848884701728821, "bbox": [50.2, 107.0, 111.0, 352.4], "category": "person", "mask": {"size": [480, 640], "counts": 
"kRP1>^>h0YO9G5K5K5L5J5L8G3N2N4M4N2O1N2O1O1O100O010O100000000001N1O2N1O2N2M3M4K4L4L5J7H;G9G7I6J5J7I6J6J7J5K5J6I8H7K5J6K6I7I7J6I7J6I8Fb0YOc_U7"}}, {"score": 0.8547130823135376, "bbox": [60.3, 260.8, 165.9, 219.2], "category": "bicycle", "mask": {"size": [480, 640], "counts": "lP[1<]>P1SOe0ZO9H7I6J5N1N2N2N3M2O1N2O2M2N2N1O2O1N2O1O1O1O001N2O1O001N2O1O010O00010O010O001O1O10O01O1O1O1O00100O1O1O001O1O1O1O1O1O1O1N2O1O100000000010O000000O0100O10O01O1O100O2N1O2M2M3L5L3L4L5N1N3M3M2M4L3N4K4M4J6J6G9E:J6J7G8I2N2N3M3N2M3N1N2O1N3M2O1N2N2O1O001N10001N2O0O2O1N101O0O2O1O001N2O1O001O1O1N101O001O001O0O110O001O001O001O000001O000000001O01O01O00001O00001O001O0O2O001N2O1N2O1N2O1N2N2N3M3M3M3L20000000anm2"}}, {"score": 0.6333994269371033, "bbox": [158.8, 152.2, 74.1, 131.6], "category": "person", "mask": {"size": [480, 640], "counts": "WYa2d0V>