Repository: robot-learning-freiburg/CL-SLAM Branch: main Commit: 8150d452eedc Files: 43 Total size: 280.9 KB Directory structure: gitextract_5mhk57f5/ ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── config/ │ ├── config_adapt.yaml │ ├── config_parser.py │ └── config_pretrain.yaml ├── datasets/ │ ├── __init__.py │ ├── cityscapes.py │ ├── config.py │ ├── kitti.py │ ├── robotcar.py │ └── utils.py ├── depth_pose_prediction/ │ ├── __init__.py │ ├── config.py │ ├── depth_pose_prediction.py │ ├── networks/ │ │ ├── __init__.py │ │ ├── depth_decoder.py │ │ ├── layers.py │ │ ├── pose_decoder.py │ │ └── resnet_encoder.py │ ├── pytorch3d.py │ └── utils.py ├── loop_closure_detection/ │ ├── __init__.py │ ├── config.py │ ├── encoder.py │ ├── loop_closure_detection.py │ └── utils.py ├── main_adapt.py ├── main_pretrain.py ├── make_cityscapes_buffer.py ├── pyproject.toml ├── requirements.txt ├── slam/ │ ├── __init__.py │ ├── config.py │ ├── meshlab.py │ ├── pose_graph_optimization.py │ ├── replay_buffer.py │ ├── slam.py │ ├── transform.py │ └── utils.py └── third_party/ └── fix_g2opy.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Custom log*/ figures/ data/ iros_paper/ *.pkl *.pt _.pylintrc sync_gpu.sh # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # IDEs .idea ================================================ FILE: .gitmodules ================================================ [submodule "third_party/g2opy"] path = third_party/g2opy url = https://github.com/uoip/g2opy.git ================================================ FILE: .pre-commit-config.yaml ================================================ minimum_pre_commit_version: 2.9.3 default_language_version: # force all unspecified python hooks to run python3 python: python3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.0.1 hooks: - id: check-added-large-files name: Check large files (>500kB) args: ["--maxkb=500"] - id: trailing-whitespace name: Trim trailing whitespaces - id: end-of-file-fixer name: Add empty line to end of file - id: check-merge-conflict name: Check unresolved merge conflicts - id: check-json name: Check JSON - id: check-yaml name: Check YAML - id: check-toml name: Check TOML - id: check-xml name: Check XML - repo: https://github.com/pycqa/isort rev: 5.10.0 hooks: - id: isort name: Reorder python imports - repo: local hooks: - id: yapf name: Run yapf formatter entry: yapf language: system types: [python] - repo: local hooks: - id: pylint name: Run pylint analysis entry: pylint language: system types: [python] ================================================ FILE: LICENSE ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . ================================================ FILE: README.md ================================================ # Continual SLAM [**Website**](http://continual-slam.cs.uni-freiburg.de/) This repository is the official implementation of the papers **Continual SLAM** and **CoVIO**: [**arXiv**](https://arxiv.org/abs/2203.01578) | [**Springer**](https://link.springer.com/chapter/10.1007/978-3-031-25555-7_3) | [**Video**](https://youtu.be/ASEzwnV4vNk) > **Continual SLAM: Beyond Lifelong Simultaneous Localization and Mapping through Continual Learning**
> [Niclas Vödisch](https://vniclas.github.io/), [Daniele Cattaneo](https://rl.uni-freiburg.de/people/cattaneo), [Wolfram Burgard](http://www2.informatik.uni-freiburg.de/~burgard/), and [Abhinav Valada](https://rl.uni-freiburg.de/people/valada).
> *International Symposium on Robotics Research (ISRR)*, 2022 [**arXiv**](https://arxiv.org/abs/2303.10149) | [**IEEE Xplore**](https://ieeexplore.ieee.org/document/10209029) > **CoVIO: Online Continual Learning for Visual-Inertial Odometry**
> [Niclas Vödisch](https://vniclas.github.io/), [Daniele Cattaneo](https://rl.uni-freiburg.de/people/cattaneo), [Wolfram Burgard](http://www2.informatik.uni-freiburg.de/~burgard/), and [Abhinav Valada](https://rl.uni-freiburg.de/people/valada).
> *IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops*, 2023

Continual SLAM teaser

If you find our work useful, please consider citing our papers: ``` @InProceedings{voedisch2023clslam, author="V{\"o}disch, Niclas and Cattaneo, Daniele and Burgard, Wolfram and Valada, Abhinav", editor="Billard, Aude and Asfour, Tamim and Khatib, Oussama", title="Continual SLAM: Beyond Lifelong Simultaneous Localization and Mapping Through Continual Learning", booktitle="Robotics Research", year="2023", publisher="Springer Nature Switzerland", address="Cham", pages="19--35", } ``` ``` @article{voedisch2023covio, title="CoVIO: Online Continual Learning for Visual-Inertial Odometry", author="V{\"o}disch, Niclas and Cattaneo, Daniele and Burgard, Wolfram and Valada, Abhinav", journal="IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops", year="2023" } ``` drawing
CL-SLAM and CoVIO are also featured in the [OpenDR toolkit](https://github.com/opendr-eu/opendr). ## 📔 Abstract ### Continual SLAM While lifelong SLAM addresses the capability of a robot to adapt to changes within a single environment over time, in this paper we introduce the task of continual SLAM. Here, a robot is deployed sequentially in a variety of different environments and has to transfer its knowledge of previously experienced environments to thus far unseen environments, while avoiding catastrophic forgetting. This is particularly relevant in the context of vision-based approaches, where the relevant features vary widely between different environments. We propose a novel approach for solving the continual SLAM problem by introducing CL-SLAM. Our approach consists of a dual-network architecture that handles both short-term adaptation and long-term memory retention by incorporating a replay buffer. Extensive evaluations of CL-SLAM in three different environments demonstrate that it outperforms several baselines inspired by existing continual learning-based visual odometry methods. ### CoVIO Visual odometry is a fundamental task for many applications on mobile devices and robotic platforms. Since such applications are oftentimes not limited to predefined target domains and learning-based vision systems are known to generalize poorly to unseen environments, methods for continual adaptation during inference time are of significant interest. In this work, we introduce CoVIO for online continual learning of visual-inertial odometry. CoVIO effectively adapts to new domains while mitigating catastrophic forgetting by exploiting experience replay. In particular, we propose a novel sampling strategy to maximize image diversity in a fixed-size replay buffer that targets the limited storage capacity of embedded devices. We further provide an asynchronous version that decouples the odometry estimation from the network weight update step enabling continuous inference in real time. We extensively evaluate CoVIO on various real-world datasets demonstrating that it successfully adapts to new domains while outperforming previous methods. # 🏗 Setup Clone repository: `git clone --recurse-submodules https://github.com/robot-learning-freiburg/CL-SLAM.git` ## ⚙️ Installation - Create conda environment: `conda create --name continual_slam python=3.8` - Activate conda environment: `conda activate continual_slam` - Install dependencies: `pip install -r requirements.txt` - For smooth development, install git hook scripts: `pre-commit install` ## 🔄 Install [g2opy](https://github.com/uoip/g2opy) We use g2o for pose graph optimization. - Apply fixes for Eigen version >= 3.3.5: `./third_party/fix_g2opy.py` - Install C++ requirements: - `conda install cmake` - `conda install -c conda-forge eigen` - `conda install -c conda-forge suitesparse` - Install g2opy: ``` cd third_party/g2opy mkdir build cd build cmake -DPYBIND11_PYTHON_VERSION=3.8 .. make -j8 |NOTE: reduce number if running out of memory cd .. |NOTE: remove any .so file which is not for Python 3.8 python setup.py install |NOTE: Ensure that the conda environment is active ``` ## 💾 Data preparation To re-train or run the experiments from our paper, please download and pre-process the respective datasets. ### Cityscapes Download the following files from https://www.cityscapes-dataset.com/downloads/: - `leftImg8bit_sequence_trainvaltest.zip` (324GB) - `timestamp_sequence.zip` (40MB) - `vehicle_sequence.zip` (56MB) - `disparity_sequence_trainvaltest.zip` (106GB) (optionally, used for computing the depth error) ### Oxford RobotCar Download the RTK ground truth from https://robotcar-dataset.robots.ox.ac.uk/ground_truth/ (91MB).
Download the camera models from https://robotcar-dataset.robots.ox.ac.uk/downloads/ (129MB).
We used the files from https://robotcar-dataset.robots.ox.ac.uk/datasets/2015-08-12-15-04-18/: - `2015-08-12-15-04-18_stereo_centre_01.tar`, `...0*.tar` (25GB) - `2015-08-12-15-04-18_gps.tar` (20MB) Undistort the center images: ```python python datasets/robotcar.py ``` ### KITTI Download the KITTI Odometry data from http://www.cvlibs.net/datasets/kitti/eval_odometry.php: - `odometry data set (color, 65 GB)` - `odometry ground truth poses (4 MB)` Download the KITTI raw data from http://www.cvlibs.net/datasets/kitti/raw_data.php for the runs specified in [`datasets/kitti.py`](datasets/kitti.py) (search for `KITTI_RAW_SEQ_MAPPING`). - `[synced+rectified data]` Download the ground truth depth from http://www.cvlibs.net/datasets/kitti/eval_depth_all.php (optionally, used for computing the depth error). - `annotated depth maps data set (14GB)` Extract the raw data matching the odometry dataset. Note that sequence 03 is excluded as no IMU data (KITTI raw) has been released. ```python python datasets/kitti.py --oxts python datasets/kitti.py --depth ``` # 🏃 Running the Code ## 🏋 Pre-training We pre-trained CL-SLAM on the Cityscapes Dataset. You can either download the resulting weights, where we masked potentially dynamic objects, or pre-train the DepthNet and PoseNet by yourself by running our code. **Note** that you have to adjust the `dataset_path` in [`config/config_pretrain.yaml`](config/config_pretrain.yaml). ```python python main_pretrain.py ``` Model weights: https://continual-slam.cs.uni-freiburg.de/downloads/cityscapes_masks_pretrain.zip (Please unzip the file after download.) ## 🗺️ Adaptation with CL-SLAM For adaptation, we used the KITTI Odometry Dataset and the Oxford RobotCar Dataset. The experiments in the paper are conducted on the KITTI sequences 09 and 10 as well as on two RobotCar sequences.
In order to fill the replay buffer with the pre-training data, please run the following script after having adjusted the paths in the file. This can take some time. ```python python make_cityscapes_buffer.py ``` In the configuration file [`config/config_adapt.yaml`](config/config_adapt.yaml), please adjust the following parameters: - `Dataset.dataset` --> Set either `Kitti` or `RobotCar` - `Dataset.dataset_path` --> Set the path to the data - `DepthPosePrediction.load_weights_folder` --> Should be the path to the weights from pre-training or the previous adaptation - `Slam.dataset_sequence` --> Set the KITTI sequence, or `1` or `2` for RobotCar - `Slam.logging` --> If this is set to true, make sure to enable dataloaders in the [`slam/slam.py`](slam/slam.py) have `with_depths` argument set to `True`, also make sure that you have `gt_depth` in your dataset folder Then run: ```python python main_adapt.py ``` ## 📒 Notes ### Continual SLAM The originally released code for *Continual SLAM*, i.e., without the extensions of *CoVIO*, can be found under commit [4ac27f6](https://github.com/robot-learning-freiburg/CL-SLAM/tree/4ac27f62478cb8301f73bb294be07b846235fe6a). ### CoVIO The asynchronous variant is provided in the [OpenDR toolkit](https://github.com/opendr-eu/opendr). ## 👩‍⚖️ License For academic usage, the code is released under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html) license. For any commercial purpose, please contact the authors. ## 🙏 Acknowledgment This work was funded by the European Union’s Horizon 2020 research and innovation program under grant agreement No 871449-OpenDR.

drawing ================================================ FILE: config/config_adapt.yaml ================================================ Dataset: dataset: Kitti # dataset: RobotCar dataset_path: USER/data/kitti/odometry/dataset frame_ids: [ 0, -1, 1 ] scales: [ 0, 1, 2, 3 ] # Provided by dataloader height: 192 width: 640 DepthPosePrediction: train_set: all val_set: 0 # train_set: 2015-08-12-15-04-18 # val_set: 2015-08-12-15-04-18 resnet_depth: 18 resnet_pose: 18 resnet_pretrained: true scales: [ 0, 1, 2, 3 ] # Network size learning_rate: 0.0001 scheduler_step_size: 15 num_workers: 0 num_epochs: 20 min_depth: .1 max_depth: disparity_smoothness: .001 velocity_loss_scaling: .05 mask_dynamic: False save_frequency: -1 save_val_depth: false save_val_depth_batches: 0 multiple_gpus: false gpu_ids: batch_size: 3 log_path: ./log/slam/c_k9 load_weights_folder: ./log/cityscapes/models/weights_025 use_wandb: False ReplayBuffer: maximize_diversity: True max_buffer_size: 100 similarity_threshold: .95 similarity_sampling: False load_path: ./log/slam/c_k9/replay_buffer LoopClosureDetection: detection_threshold: .99 id_threshold: 250 num_matches: 1 Slam: dataset_sequence: 6 adaptation: True adaptation_epochs: 5 min_distance: .2 start_frame: 0 # Start mapping after this frame logging: true do_loop_closures: true keyframe_frequency: 5 lc_distance_poses: 150 # min num consec poses between lc checks ================================================ FILE: config/config_parser.py ================================================ import dataclasses import sys from os import PathLike from pathlib import Path from typing import List, Union, get_args, get_origin import yaml from datasets import Config as Dataset from depth_pose_prediction import Config as DepthPosePrediction from loop_closure_detection import Config as LoopClosureDetection from slam import Config as Slam from slam import ReplayBufferConfig as ReplayBuffer class ConfigParser(): def __init__(self, config_file: Union[str, PathLike, Path]) -> None: self.filename = Path(config_file) self.config_dict = {} self.dataset = None self.depth_pose = None self.loop_closure = None self.slam = None self.replay_buffer = None self.parse() def parse(self): with open(self.filename, 'r', encoding='utf-8') as file: self.config_dict = yaml.safe_load(file) # Read lists as tuples for config_type in self.config_dict.values(): for key, value in config_type.items(): if isinstance(value, List): config_type[key] = tuple(value) # Correct wrongly parsed data types for config_type_key, config_type in self.config_dict.items(): config_type_class = getattr(sys.modules[__name__], config_type_key) for field in dataclasses.fields(config_type_class): if field.name == 'config_file': continue value = config_type[field.name] if value is not None: expected_type = field.type if get_origin(field.type) is not None: expected_type = get_origin(field.type) if expected_type is Union: expected_type = [] for tp in get_args(field.type): if get_origin(tp) is not None: expected_type.append(get_origin(tp)) else: expected_type.append(tp) else: expected_type = [expected_type] # Remove the NoneType before attempting conversions expected_type = [tp for tp in expected_type if tp is not type(None)] if not any(isinstance(value, tp) for tp in expected_type): if len(expected_type) == 1: print(f'[CONFIG] Converting {field.name} from {type(value).__name__} ' f'to {expected_type[0].__name__}.') config_type[field.name] = expected_type[0](value) else: assert False, 'Found an unknown issue!' elif get_origin(field.type) is Union and type(None) in get_args(field.type): # Is optional print(f'[CONFIG] Setting {field.name} to None.') elif get_origin(field.type) is Union: # Is required raise ValueError(f'[CONFIG] Required parameter missing: {field.name}') else: assert False, 'Found an unknown issue!' # Add the path to the config file to all Configurations for config_type in self.config_dict.values(): config_type['config_file'] = self.filename # Convert paths to absolute paths for config_type in self.config_dict.values(): for key, value in config_type.items(): if isinstance(value, Path): config_type[key] = value.absolute() # Read the sections if 'Dataset' in self.config_dict: self.dataset = Dataset(**self.config_dict['Dataset']) if 'DepthPosePrediction' in self.config_dict: self.depth_pose = DepthPosePrediction(**self.config_dict['DepthPosePrediction']) if 'LoopClosureDetection' in self.config_dict: self.loop_closure = LoopClosureDetection(**self.config_dict['LoopClosureDetection']) if 'Slam' in self.config_dict: self.slam = Slam(**self.config_dict['Slam']) if 'ReplayBuffer' in self.config_dict: self.replay_buffer = ReplayBuffer(**self.config_dict['ReplayBuffer']) def __str__(self): string = '' for config_type_name, config_type in self.config_dict.items(): string += f'----- {config_type_name} --- START -----\n' for name, value in config_type.items(): if name != 'config_file': string += f'{name:25} : {value}\n' string += f'----- {config_type_name} --- END -------\n' string = string[:-1] return string if __name__ == '__main__': config = ConfigParser('./config.yaml') print(config) ================================================ FILE: config/config_pretrain.yaml ================================================ Dataset: type: Cityscapes dataset_path: USER/data/cityscapes scales: [ 0, 1, 2, 3 ] # Provided by dataloader height: 192 width: 640 frame_ids: [ 0, -1, 1 ] DepthPosePrediction: train_set: [ train ] val_set: val resnet: 18 resnet_pretrained: true scales: [ 0, 1, 2, 3 ] # Network size learning_rate: 1e-4 scheduler_step_size: 15 batch_size: 18 num_workers: 12 num_epochs: 25 min_depth: .1 max_depth: disparity_smoothness: .001 velocity_loss_scaling: .05 mask_dynamic: false log_path: ./log/cityscapes save_frequency: 5 save_val_depth: true save_val_depth_batches: 1 multiple_gpus: true gpu_ids: [ 0 ] load_weights_folder: use_wandb: false ================================================ FILE: datasets/__init__.py ================================================ from datasets.cityscapes import Cityscapes from datasets.config import Dataset as Config from datasets.kitti import Kitti from datasets.robotcar import Robotcar ================================================ FILE: datasets/cityscapes.py ================================================ import json from os import PathLike from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union import cv2 import numpy as np import torch from PIL import Image from torch import Tensor from datasets.utils import Dataset class Cityscapes(Dataset): def __init__( self, data_path: Union[str, Path, PathLike], subsets: Union[str, List[str], Tuple[str, ...]], frame_ids: Union[List[int], Tuple[int, ...]], scales: Optional[Union[List[int], Tuple[int, ...]]] = None, height: Optional[int] = None, width: Optional[int] = None, do_augmentation: bool = False, views: Union[List[str], Tuple[str, ...]] = ('left', ), with_depth: bool = False, with_mask: bool = False, ) -> None: if any(v != 'left' for v in views): raise ValueError('Cityscapes supports only views = ["left"]') super().__init__(data_path, frame_ids, scales, height, width, do_augmentation, views, with_depth, with_mask) # The following subsets are available: # -) train --> used for pretraining # -) val --> used for validation during training # -) test --> not used in the published version # -) frankfurt --> not used in the published version. Refers to "allFrames_frankfurt" subsets = (subsets, ) if isinstance(subsets, str) else subsets if any(s not in ['train', 'val', 'test', 'frankfurt'] for s in subsets): raise ValueError('subsets must be one of ["train", "val", "test"]') if 'frankfurt' in subsets and len(subsets) > 1: raise ValueError('The subset "frankfurt" cannot be combined with other subsets.') self.subsets = subsets self._load_image_filenames() self.vehicle_filenames = self._get_filenames(mode='vehicle') self.timestamp_filenames = self._get_filenames(mode='timestamp') self.disparity_filenames = self._get_filenames(mode='disparity') if self.with_depth else [] if self.with_mask: self._load_mask_filenames() def _get_filenames(self, mode: str) -> List[Path]: if self.subsets[0] == 'frankfurt': valid_modes = ['rgb_left', 'vehicle', 'timestamp'] if mode not in valid_modes: raise ValueError(f'mode must be one of {valid_modes}. [subset == "frankfurt"]') mode_dir = { 'rgb_left': 'leftImg8bit_allFrames', 'vehicle': 'vehicle_allFrames', 'timestamp': 'timestamp_allFrames', }[mode] subsets = ('val', ) else: valid_modes = ['rgb_left', 'vehicle', 'timestamp', 'disparity', 'mask_left'] if mode not in valid_modes: raise ValueError(f'mode must be one of {valid_modes}.') mode_dir = { 'rgb_left': 'leftImg8bit_sequence', 'vehicle': 'vehicle_sequence', 'timestamp': 'timestamp_sequence', 'disparity': 'disparity_sequence', 'mask_left': 'segm_mask_sequence', }[mode] subsets = self.subsets mode_ext = { 'rgb_left': 'png', 'vehicle': 'json', 'timestamp': 'txt', 'disparity': 'png', 'mask_left': 'png', }[mode] filenames = [] counter_indices = 0 for subset in subsets: subset_data_path = self.data_path / mode_dir / subset cities = sorted(subset_data_path.glob('*')) for city_path in cities: city_filenames = sorted(city_path.glob(f'*.{mode_ext}')) filenames += city_filenames city_sequences = self._divide_into_sequences(city_filenames) for city_sequence, count in city_sequences.items(): if city_sequence not in self.sequence_indices: self.sequence_indices[city_sequence] = (counter_indices, counter_indices + count - 1) counter_indices += count return filenames @staticmethod def _divide_into_sequences(city_filenames: List[Path]) -> Dict[str, int]: # filename: ___. # Both and are used in this function to detect the start of a new sequence. filenames = [f.stem for f in city_filenames] city = city_filenames[0].stem.split('_')[0] city_sequences = {} sequence_length = 1 sequence_counter = 0 for file_1, file_2 in zip(filenames, filenames[1:]): seq_1, seq_2 = int(file_1.split('_')[1]), int(file_2.split('_')[1]) cnt_1, cnt_2 = int(file_1.split('_')[2]), int(file_2.split('_')[2]) if seq_1 == seq_2 and cnt_1 + 1 == cnt_2: # Continuation of the same sequence sequence_length += 1 elif (seq_1 != seq_2) or (seq_1 == seq_2 and cnt_1 + 1 != cnt_2): # New sequence started. Either increment in seq or non-consecutive cnt. city_sequences[f'{city}_{sequence_counter:06}'] = sequence_length sequence_length = 1 sequence_counter += 1 else: raise RuntimeError('Ran into an unexpected situation.') # Add the final sequence city_sequences[f'{city}_{sequence_counter:06}'] = sequence_length return city_sequences def __getitem__(self, index: int) -> Dict[Any, Tensor]: img_filenames, mask_filenames, index, do_color_augmentation, do_flip = self._pre_getitem( index) # load the color images item = {('index'): index} original_image_shape = None for frame_id in self.frame_ids: rgb = Image.open(img_filenames[index + frame_id]).convert('RGB') if frame_id == 0: original_image_shape = rgb.size if do_flip: rgb = rgb.transpose(Image.FLIP_LEFT_RIGHT) if len(self.scales) > 0: rgb = self.resize[0](rgb) item[('rgb', frame_id, 0)] = rgb else: item[('rgb', frame_id, -1)] = rgb # Adjusting intrinsics to match each scale in the pyramid normalized_camera_matrix, baseline = self._load_camera_calibration( index, original_image_shape) for scale in range(len(self.scales)): camera_matrix, inv_camera_matrix = self._scale_camera_matrix( normalized_camera_matrix, scale) item[('camera_matrix', scale)] = camera_matrix item[('inv_camera_matrix', scale)] = inv_camera_matrix if len(self.scales) == 0: # Load camera matrix of raw image data camera_matrix, baseline = self._load_camera_calibration(index, (1, 1)) item[('camera_matrix', -1)] = camera_matrix item[('inv_camera_matrix', -1)] = np.linalg.pinv(camera_matrix) # Load mask of potentially dynamic objects if self.with_mask: frame_id = 0 mask = Image.open(mask_filenames[index + frame_id]) if do_flip: mask = mask.transpose(Image.FLIP_LEFT_RIGHT) if len(self.scales) > 0: mask = self.resize[0](mask) item[('mask', frame_id, 0)] = mask else: item[('mask', frame_id, -1)] = mask # Load relative distance (except for first frame) for frame_id in self.frame_ids[1:]: item[('relative_distance', frame_id)] = self._load_relative_distance(index + frame_id) # Load unscaled depth based on the provided disparity maps if self.with_depth: for frame_id in self.frame_ids: depth = self._load_depth(index + frame_id, normalized_camera_matrix, baseline, original_image_shape) if do_flip: depth = np.fliplr(depth) item[('depth', frame_id, -1)] = depth self._post_getitem(item, do_color_augmentation) # This will also call _preprocess() return item def _load_camera_calibration( self, index: int, image_shape: Tuple[int, int], ) -> Tuple[np.ndarray, float]: """ Returns the intrinsics matrix normalized by the original image size. """ width, height = image_shape city = self.vehicle_filenames[index].parent.name subset = self.vehicle_filenames[index].parents[1].name sequence = '_'.join(self.vehicle_filenames[index].name.split('_')[:2]) data_path = self.data_path / 'camera' / subset / city # We assume that the camera intrinsics are constant for one recording camera_file = sorted(data_path.glob(f'{sequence}_*_camera.json'))[0] with open(camera_file, 'r', encoding='utf-8') as f: data = json.load(f) intrinsics = data['intrinsic'] baseline = data['extrinsic']['baseline'] camera_matrix = np.array( [[intrinsics['fx'], 0, intrinsics['u0'], 0], [0, intrinsics['fy'], intrinsics['v0'], 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=np.float32) camera_matrix[0, :] /= width camera_matrix[1, :] /= height return camera_matrix, baseline def _load_relative_distance(self, index: int) -> float: """ Distance in meters and with respect to the previous frame. """ previous_timestamp = np.loadtxt(str(self.timestamp_filenames[index - 1])) current_timestamp = np.loadtxt(str(self.timestamp_filenames[index])) delta_timestamp = (current_timestamp - previous_timestamp) / 1e9 # ns to s with open(self.vehicle_filenames[index - 1], 'r', encoding='utf-8') as f: previous_speed = json.load(f)['speed'] with open(self.vehicle_filenames[index], 'r', encoding='utf-8') as f: current_speed = json.load(f)['speed'] speed = (previous_speed + current_speed) / 2 # m/s distance = speed * delta_timestamp # m return distance def _load_depth( self, index: int, scaled_camera_matrix: np.ndarray, baseline: float, image_shape: Tuple[int, int], ) -> np.ndarray: """ Depth in meters computed based on the provided disparity maps. Zero elements encode missing disparity values. """ disparity = cv2.imread(str(self.disparity_filenames[index]), cv2.IMREAD_UNCHANGED).astype(np.float32) disparity[disparity > 0] = (disparity[disparity > 0] - 1) / 256 # According to README # Reconstruct the value of the raw data focal_length_x = scaled_camera_matrix[0, 0] * image_shape[0] depth = np.zeros_like(disparity) depth[disparity > 0] = (baseline * focal_length_x) / disparity[disparity > 0] # m return depth def _preprocess( self, item: Dict[Any, Any], augment: Union[Callable, Tuple[Tensor, Optional[float], Optional[float], Optional[float], Optional[float]]], ) -> None: # Apply augmentation # Convert to list object as we are changing the size during the iteration for key in list(item.keys()): value = item[key] if 'rgb' in key: k, frame_id, scale = key item[key] = self._to_tensor(value) item[(f'{k}_aug', frame_id, scale)] = self._to_tensor(augment(value)) elif 'camera_matrix' in key or 'inv_camera_matrix' in key: item[key] = torch.from_numpy(value) elif 'depth' in key: item[key] = self._to_tensor(value) elif 'mask' in key: item[key] = torch.round(self._to_tensor(value)) # if __name__ == '__main__': # dataset = Cityscapes('/home/voedisch/data/cityscapes', ('frankfurt'), (0, -1, 1), # (0, 1, 2, 3), # 192, # 640, # with_mask=False, # with_depth=True) # print(dataset[0].keys()) ================================================ FILE: datasets/config.py ================================================ import dataclasses from pathlib import Path from typing import Optional, Tuple @dataclasses.dataclass class Dataset: dataset: str config_file: Path dataset_path: Optional[Path] scales: Optional[Tuple[int, ...]] height: Optional[int] width: Optional[int] frame_ids: Tuple[int, ...] ================================================ FILE: datasets/kitti.py ================================================ # Adapted from: # https://github.com/nianticlabs/monodepth2/blob/master/datasets/kitti_dataset.py # https://github.com/nianticlabs/monodepth2/blob/master/datasets/mono_dataset.py#L28 import argparse from datetime import datetime from os import PathLike from pathlib import Path from shutil import copyfile from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch from PIL import Image from torch import Tensor from tqdm import tqdm from datasets.utils import Dataset class Kitti(Dataset): def __init__( self, data_path: Union[str, Path, PathLike], sequences: Union[int, List[int], Tuple[int, ...]], frame_ids: Union[List[int], Tuple[int, ...]], scales: Optional[Union[List[int], Tuple[int, ...]]] = None, height: Optional[int] = None, width: Optional[int] = None, do_augmentation: bool = False, views: Union[List[str], Tuple[str, ...]] = ('left', ), with_depth: bool = False, with_mask: bool = False, poses: bool = False, min_distance: float = 0, ) -> None: super().__init__(data_path, frame_ids, scales, height, width, do_augmentation, views, with_depth, with_mask, min_distance=min_distance) # if self.with_depth and (sequences != 8 and sequences[0] != 8): # raise ValueError('gt_depth only supported for sequence 8') self.include_poses = poses sequences = (sequences, ) if isinstance(sequences, int) else sequences if any(s > 10 for s in sequences): raise ValueError('Passed a sequence without ground truth data.') if 3 in sequences: raise ValueError('Passed a sequence without IMU data.') self.sequences = sorted(sequences) # NOTE: Make sure your intrinsics matrix is *normalized* by the original image size. # To normalize you need to scale the first row by 1 / image_width and the second row # by 1 / image_height. Monodepth2 assumes a principal point to be exactly centered. # If your principal point is far from the center you might need to disable the horizontal # flip augmentation. self.camera_matrix = np.array( [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=np.float32) self._load_image_filenames() self.left_depth_filenames = self._get_filenames( mode='depth_left') if self.with_depth else [] if self.with_mask: self._load_mask_filenames() self.velocity_filenames = self._get_filenames('velocity') self.timestamps = self._load_timestamps() self.global_poses = self._load_global_poses() # Make sure that the RGB images and the depth data matches # Adjust other data as well if self.with_depth: assert len(sequences) == 1 # ToDo: update sequence_indices depth_numbers = [int(d.stem) for d in self.left_depth_filenames] tmp_images = [] tmp_velocity = [] tmp_timestamps = [] tmp_masks = [] tmp_poses = [] for i, img in enumerate(self.left_img_filenames): if int(img.stem) in depth_numbers: tmp_images.append(img) tmp_velocity.append(self.velocity_filenames[i]) tmp_timestamps.append(self.timestamps[i]) tmp_poses.append(self.global_poses[i]) if self.with_mask: tmp_masks.append(self.left_mask_filenames[i]) self.left_img_filenames = tmp_images self.velocity_filenames = tmp_velocity self.timestamps = tmp_timestamps self.global_poses = np.stack(tmp_poses) self.left_mask_filenames = tmp_masks self.sequence_indices[sequences[0]] = (0, len(self.left_img_filenames) - 1) # Filter data to meet minimum distance requirements self.relative_distances = self._compute_relative_distance() if self.min_distance > 0: assert len(sequences) == 1 # ToDo: update sequence_indices self._filter_by_distance(self.min_distance) self.relative_poses = self._load_relative_poses() # Has to be done in the end def _get_filenames(self, mode: str) -> List[Path]: """Load the file names of the corresponding subfolders of a sequence. """ valid_modes = [ 'rgb_left', 'rgb_right', 'lidar', 'depth_left', 'mask_left', 'mask_right', 'velocity' ] if mode not in valid_modes: raise ValueError(f'mode must be one of {valid_modes}.') mode_dir = { 'rgb_left': 'image_2', 'rgb_right': 'image_3', 'lidar': 'velodyne', 'depth_left': 'gt_depth/image_02', 'mask_left': 'segm_mask/image_2', 'mask_right': 'segm_mask/image_3', 'velocity': 'oxts/data', }[mode] mode_ext = { 'rgb_left': 'png', 'rgb_right': 'png', 'lidar': 'bin', 'depth_left': 'png', 'mask_left': 'png', 'mask_right': 'png', 'velocity': 'txt', }[mode] filenames = [] for sequence in self.sequences: sequence_index = [len(filenames)] sequence_data_path = self.data_path / 'sequences' / f'{sequence:02}' / mode_dir filenames += sorted(sequence_data_path.glob(f'*.{mode_ext}')) sequence_index.append(len(filenames) - 1) if sequence not in self.sequence_indices: self.sequence_indices[sequence] = tuple(sequence_index) return filenames def _load_timestamps(self) -> List[int]: timestamp_format = '%Y-%m-%d %H:%M:%S.%f' timestamps_ = [] for sequence in self.sequences: # Load timestamps as written in the file, e.g., 2011-10-03 12:55:34.997992704 timestamps_file = self.data_path / 'sequences' / f'{sequence:02}' / 'oxts' / \ 'timestamps.txt' with open(timestamps_file, 'r', encoding='utf-8') as f: str_timestamps = f.read().splitlines() # Convert relative to timestamp of initial frame and output in seconds # Discard nanoseconds as they are not supported by datetime sequence_timestamps = [] for timestamp in str_timestamps: sequence_timestamps.append( (datetime.strptime(timestamp[:-3], timestamp_format) - datetime.strptime(str_timestamps[0][:-3], timestamp_format)).total_seconds()) timestamps_ += sequence_timestamps return timestamps_ def _load_global_poses(self) -> np.ndarray: global_poses = [] for sequence in self.sequences: # Load global poses poses_data_path = self.data_path / 'poses' / f'{sequence:02}.txt' sequence_poses = np.loadtxt(str(poses_data_path), dtype=np.float32).reshape((-1, 3, 4)) # Convert to homogenous coordinates sequence_poses = np.concatenate( (sequence_poses, np.zeros((sequence_poses.shape[0], 1, 4), dtype=np.float32)), 1) sequence_poses[:, 3, 3] = 1 global_poses.append(sequence_poses) return np.concatenate(global_poses) def _load_relative_poses(self) -> np.ndarray: relative_poses = [] for sequence in self.sequences: indices = self.sequence_indices[sequence] sequence_poses = self.global_poses[indices[0]:indices[1] + 1] # Convert global poses to poses that are relative to the previous frame # The initial value is zero as there is no previous frame sequence_poses_relative = [np.eye(4, dtype=np.float32)] for i in range(1, len(sequence_poses)): sequence_poses_relative.append( np.linalg.inv(sequence_poses[i - 1]) @ sequence_poses[i]) sequence_poses_relative = np.stack(sequence_poses_relative) relative_poses.append(sequence_poses_relative) return np.concatenate(relative_poses) def _compute_relative_distance(self) -> List[float]: relative_distances = [0.] for index in range(1, len(self.timestamps)): relative_distances.append(self._load_relative_distance(index)) return relative_distances def _filter_by_index(self, keep_indices: Union[np.ndarray, List[int]]) -> None: # Remove all timestamps, images, and velocities without poses self.left_img_filenames = [self.left_img_filenames[i] for i in keep_indices] if self.left_img_filenames else [] self.right_img_filenames = [self.right_img_filenames[i] for i in keep_indices] if self.right_img_filenames else [] self.left_mask_filenames = [self.left_mask_filenames[i] for i in keep_indices] if self.left_mask_filenames else [] self.right_mask_filenames = [self.right_mask_filenames[i] for i in keep_indices] if self.right_mask_filenames else [] self.left_depth_filenames = [self.left_depth_filenames[i] for i in keep_indices] if self.left_depth_filenames else [] self.velocity_filenames = [self.velocity_filenames[i] for i in keep_indices] self.timestamps = [self.timestamps[i] for i in keep_indices] self.global_poses = [self.global_poses[i] for i in keep_indices] def _filter_by_distance(self, min_distance: float) -> None: distance = 0 keep_indices = [0] relative_distances = [0] for i, relative_distance in enumerate(self.relative_distances[1:], start=1): distance += np.abs(relative_distance) if distance >= min_distance: keep_indices.append(i) relative_distances.append(distance) distance = 0 self._filter_by_index(keep_indices) # Do no re-compute with function to exploit higher frequency self.relative_distances = relative_distances def __getitem__(self, index: int) -> Dict[Any, Tensor]: """ ('rgb', , ) ('camera_matrix', ) ('inv_camera_matrix', ) ('relative_pose', ) | with respect to frame_id - 1 is relative to the requested index, i.e., the temporal neighbors == -1 denotes the original size and is deleted before returning """ img_filenames, mask_filenames, index, do_color_augmentation, do_flip = self._pre_getitem( index) # Load the color images item = {('index'): index} original_image_shape = None for frame_id in self.frame_ids: rgb = Image.open(img_filenames[index + frame_id]).convert('RGB') if frame_id == 0: original_image_shape = rgb.size if do_flip: rgb = rgb.transpose(Image.FLIP_LEFT_RIGHT) if len(self.scales) > 0: # Immediately rescale since the images are of various size across the sequences # Keeping the original resolution means that pytorch cannot batch images from # different sequences rgb = self.resize[0](rgb) item[('rgb', frame_id, 0)] = rgb else: item[('rgb', frame_id, -1)] = rgb # Adjusting intrinsics to match each scale in the pyramid for scale in range(len(self.scales)): camera_matrix, inv_camera_matrix = self._scale_camera_matrix(self.camera_matrix, scale) item[('camera_matrix', scale)] = camera_matrix item[('inv_camera_matrix', scale)] = inv_camera_matrix if len(self.scales) == 0: # Load camera matrix of raw image data camera_matrix = self.camera_matrix.copy() camera_matrix[0, :] *= original_image_shape[0] camera_matrix[1, :] *= original_image_shape[1] item[('camera_matrix', -1)] = camera_matrix item[('inv_camera_matrix', -1)] = np.linalg.pinv(camera_matrix) # Load mask of potentially dynamic objects if self.with_mask: frame_id = 0 mask = Image.open(mask_filenames[index + frame_id]) if do_flip: mask = mask.transpose(Image.FLIP_LEFT_RIGHT) if len(self.scales) > 0: mask = self.resize[0](mask) item[('mask', frame_id, 0)] = mask else: item[('mask', frame_id, -1)] = mask # Load unscaled depth if self.with_depth: # Debugging checks for frame_id in self.frame_ids: assert int(img_filenames[index + frame_id].stem) == int( self.left_depth_filenames[index + frame_id].stem) for frame_id in self.frame_ids: assert self.views == ('left', ) depth = Image.open(self.left_depth_filenames[index + frame_id]) if do_flip: depth = depth.transpose(Image.FLIP_LEFT_RIGHT) item[('depth', frame_id, -1)] = depth # Load relative distance (except for first frame) for frame_id in self.frame_ids[1:]: item['relative_distance', frame_id] = self.relative_distances[index + frame_id] if self.include_poses: # Load the poses (except for the first frame) for frame_id in self.frame_ids[1:]: item[('relative_pose', frame_id)] = self.relative_poses[index + frame_id] if do_flip: # We also have to flip the relative pose (rotate around y-axis) item[('relative_pose', 0)][2, 0] *= -1 item[('relative_pose', 0)][0, 2] *= -1 item[('absolute_pose', frame_id)] = self.global_poses[index + frame_id] self._post_getitem(item, do_color_augmentation) # This will also call _preprocess() return item def _load_relative_distance(self, index: int) -> float: """ Distance in meters and with respect to the previous frame. """ previous_timestamp = self.timestamps[index - 1] current_timestamp = self.timestamps[index] delta_timestamp = current_timestamp - previous_timestamp # s # Compute speed as norm of forward, leftward, and upward elements previous_speed = np.linalg.norm(np.loadtxt(str(self.velocity_filenames[index - 1]))[8:11]) current_speed = np.linalg.norm(np.loadtxt(str(self.velocity_filenames[index]))[8:11]) speed = (previous_speed + current_speed) / 2 # m/s distance = speed * delta_timestamp return distance def _preprocess( self, item: Dict[Any, Tensor], augment: Union[Callable, Tuple[Tensor, Optional[float], Optional[float], Optional[float], Optional[float]]], ) -> None: # Apply augmentation # Convert to list object as we are changing the size during the iteration for key in list(item.keys()): value = item[key] if 'rgb' in key: k, frame_id, scale = key item[key] = self._to_tensor(value) # if self.do_augmentation: item[(f'{k}_aug', frame_id, scale)] = self._to_tensor(augment(value)) elif 'camera_matrix' in key or 'inv_camera_matrix' in key: item[key] = torch.from_numpy(value) elif 'depth' in key: item[key] = self._to_tensor(value) / 100 # Convert cm to m elif 'relative_pose' in key or 'absolute_pose' in key: item[key] = self._to_tensor(value) elif 'mask' in key: item[key] = torch.round(self._to_tensor(value)) # ========================================================= def extract_raw_data( raw_dataset_path: Path, odometry_dataset_path: Path, oxts: bool = True, gt_depth: bool = False, ) -> None: # yapf: disable # Mapping between KITTI Raw Drives and KITTI Odometry Sequences KITTI_RAW_SEQ_MAPPING = { 0: {'date': '2011_10_03', 'drive': 27, 'start_frame': 0, 'end_frame': 4540}, 1: {'date': '2011_10_03', 'drive': 42, 'start_frame': 0, 'end_frame': 1100}, 2: {'date': '2011_10_03', 'drive': 34, 'start_frame': 0, 'end_frame': 4660}, # 3: {'date': '2011_09_26', 'drive': 67, 'start_frame': 0, 'end_frame': 800}, # No IMU 4: {'date': '2011_09_30', 'drive': 16, 'start_frame': 0, 'end_frame': 270}, 5: {'date': '2011_09_30', 'drive': 18, 'start_frame': 0, 'end_frame': 2760}, 6: {'date': '2011_09_30', 'drive': 20, 'start_frame': 0, 'end_frame': 1100}, 7: {'date': '2011_09_30', 'drive': 27, 'start_frame': 0, 'end_frame': 1100}, 8: {'date': '2011_09_30', 'drive': 28, 'start_frame': 1100, 'end_frame': 5170}, 9: {'date': '2011_09_30', 'drive': 33, 'start_frame': 0, 'end_frame': 1590}, 10: {'date': '2011_09_30', 'drive': 34, 'start_frame': 0, 'end_frame': 1200}, } # yapf: enable total_frames = 0 for mapping in KITTI_RAW_SEQ_MAPPING.values(): total_frames += (mapping['end_frame'] - mapping['start_frame'] + 1) if gt_depth: with tqdm(desc='Copying depth files', total=total_frames * 2, unit='files') as pbar: for sequence, mapping in KITTI_RAW_SEQ_MAPPING.items(): # This is the improved gt depth using 5 consecutive frames from # "Sparsity invariant CNNs", J. Uhrig et al., 3DV, 2017. odometry_sequence_path = odometry_dataset_path / f'{sequence:02}' / 'gt_depth' split = 'val' if sequence == 4 else 'train' raw_sequence_path = raw_dataset_path / split / \ f'{mapping["date"]}_drive_{mapping["drive"]:04}_sync' / \ 'proj_depth' / 'groundtruth' if not raw_sequence_path.exists(): continue for image in ['image_02', 'image_03']: image_raw_sequence_path = raw_sequence_path / image (odometry_sequence_path / image).mkdir(exist_ok=True, parents=True) raw_filenames = sorted(image_raw_sequence_path.glob('*')) for raw_filename in raw_filenames: odometry_filename = odometry_sequence_path / image / raw_filename.name frame = int(raw_filename.stem) if mapping['start_frame'] <= frame <= mapping['end_frame']: copyfile(raw_filename, odometry_filename) pbar.update(1) pbar.set_postfix({'sequence': sequence}) if oxts: with tqdm(desc='Copying OXTS files', total=total_frames, unit='files') as pbar: for sequence, mapping in KITTI_RAW_SEQ_MAPPING.items(): odometry_sequence_path = odometry_dataset_path / f'{sequence:02}' / 'oxts' raw_sequence_path = raw_dataset_path / \ f'{mapping["date"]}' / \ f'{mapping["date"]}_drive_{mapping["drive"]:04}_sync' / \ 'oxts' if not raw_sequence_path.exists(): continue odometry_sequence_path.mkdir(exist_ok=True, parents=True) copyfile(raw_sequence_path / 'dataformat.txt', odometry_sequence_path / 'dataformat.txt') with open(raw_sequence_path / 'timestamps.txt', 'r', encoding='utf-8') as f: timestamps = f.readlines()[mapping['start_frame']:mapping['end_frame'] + 1] with open(odometry_sequence_path / 'timestamps.txt', 'w', encoding='utf-8') as f: f.writelines(timestamps) for image in ['data']: image_raw_sequence_path = raw_sequence_path / image (odometry_sequence_path / image).mkdir(exist_ok=True, parents=True) raw_filenames = sorted(image_raw_sequence_path.glob('*')) for raw_filename in raw_filenames: odometry_filename = odometry_sequence_path / image / raw_filename.name frame = int(raw_filename.stem) if mapping['start_frame'] <= frame <= mapping['end_frame']: copyfile(raw_filename, odometry_filename) pbar.update(1) pbar.set_postfix({'sequence': sequence}) # ========================================================= if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('raw_path', type=str) parser.add_argument('odom_path', type=str) group = parser.add_mutually_exclusive_group() group.add_argument('--oxts', action='store_true') group.add_argument('--depth', action='store_true') args = parser.parse_args() extract_raw_data(Path(args.raw_path), Path(args.odom_path), oxts=args.oxts, gt_depth=args.depth) # seq = [i for i in range(11) if i != 3] # dataset = Kitti('/home/voedisch/data/kitti/odometry/dataset', # seq, (0, -1, 1), (0, 1, 2, 3), # 192, # 640, # with_mask=True, # poses=True) # print(dataset[0].keys()) ================================================ FILE: datasets/robotcar.py ================================================ import argparse import bisect import multiprocessing as mp import os import re from functools import partial from math import sqrt from os import PathLike from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import pandas as pd import torch from colour_demosaicing import demosaicing_CFA_Bayer_bilinear as demosaic from PIL import Image from scipy.interpolate import interp1d from scipy.ndimage import map_coordinates from scipy.spatial.transform import Rotation from torch import Tensor from tqdm import tqdm from datasets.utils import Dataset class Robotcar(Dataset): def __init__( self, data_path: Union[str, Path, PathLike], sequences: Union[str, List[str], Tuple[str, ...]], frame_ids: Union[List[int], Tuple[int, ...]], scales: Optional[Union[List[int], Tuple[int, ...]]] = None, height: Optional[int] = None, width: Optional[int] = None, do_augmentation: bool = False, views: Union[List[str], Tuple[str, ...]] = ('left', ), with_depth: bool = False, with_mask: bool = False, poses: bool = False, min_distance: float = 0, every_n_frame: int = 1, start_frame: int = 750, end_frame: int = -1, ) -> None: if with_mask: raise ValueError('Robotcar does not support loading masks.') if with_depth: raise ValueError('Robotcar does not support loading depth.') # This is actually the center if any(v != 'left' for v in views): raise ValueError('Robotcar supports only views = ["left"]') super().__init__(data_path, frame_ids, scales, height, width, do_augmentation, views, with_depth, with_mask, min_distance=min_distance) self.include_poses = poses self.every_n_frame = every_n_frame self.start_frame = start_frame self.end_frame = end_frame sequences = (sequences, ) if isinstance(sequences, str) else sequences self.sequences = sequences self._load_image_filenames() self.timestamps = [int(f.stem) for f in self.left_img_filenames] self.camera_matrix = self._load_camera_calibration() self.velocity = self._load_velocity() self.relative_distances = self._compute_relative_distance() self.global_poses = None self.relative_poses = None if self.include_poses: self.global_poses = self._load_global_poses() # Filter data to meet minimum distance requirements if self.min_distance > 0: self._filter_by_distance(self.min_distance) if self.include_poses: self.relative_poses = self._load_relative_poses() # Has to be done in the end def _get_filenames(self, mode: str) -> List[Path]: valid_modes = ['rgb_left'] if mode not in valid_modes: raise ValueError(f'mode must be one of {valid_modes}') mode_dir = { 'rgb_left': 'stereo/center', }[mode] mode_ext = { 'rgb_left': 'png', }[mode] filenames = [] for sequence in self.sequences: sequence_index = [len(filenames)] sequence_data_path = self.data_path / sequence / mode_dir filenames += sorted(sequence_data_path.glob( f'*.{mode_ext}'))[self.start_frame:self.end_frame:self.every_n_frame] sequence_index.append(len(filenames) - 1) if sequence not in self.sequence_indices: self.sequence_indices[sequence] = tuple(sequence_index) return filenames def _load_velocity(self) -> List[float]: """ Returns corresponding velocity for each image """ speed = [] for sequence in self.sequences: ins_file = self.data_path / sequence / 'gps' / 'ins.csv' column_list = ['timestamp', 'velocity_north', 'velocity_east', 'velocity_down'] data = pd.read_csv(ins_file, usecols=column_list) raw_timestamps = data['timestamp'].to_numpy() raw_speed = np.linalg.norm(data.to_numpy()[:, 1:], axis=1) # m/s # Linearly interpolate speed for each frame speed = interp1d(raw_timestamps, raw_speed)(self.timestamps) return speed def _load_camera_calibration(self) -> np.ndarray: """ Returns the intrinsics matrix normalized by the original image size. """ rgb = Image.open(self.left_img_filenames[0]).convert('RGB') width, height = rgb.size camera_file = self.data_path / 'camera_models' / 'stereo_narrow_left.txt' with open(camera_file, 'r', encoding='utf-8') as f: vals = [float(x) for x in next(f).split()] focal_length = (vals[0], vals[1]) principal_point = (vals[2], vals[3]) camera_matrix = np.array( [[focal_length[0], 0, principal_point[0], 0], [0, focal_length[1], principal_point[1], 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=np.float32) camera_matrix[0, :] /= width camera_matrix[1, :] /= height return camera_matrix def _load_global_poses(self) -> np.ndarray: """ Reads the poses based on the provided RTK data """ global_poses = [] relative_poses = [] for sequence in self.sequences: rtk_file = self.data_path / 'rtk' / sequence / 'rtk.csv' column_list = ['timestamp', 'northing', 'easting', 'down', 'roll', 'pitch', 'yaw'] data = pd.read_csv(rtk_file, usecols=column_list) timestamps = data['timestamp'].to_numpy() utm = data.to_numpy()[:, 1:4] rpy = data.to_numpy()[:, 4:] # pylint: disable=no-member utm -= utm[0, :] # Set first pose to origin (0, 0, 0) to avoid large numbers utm[:, [1, 2]] = utm[:, [2, 1]] # Swap y and z axes rpy[:, [1, 2]] = rpy[:, [2, 1]] utm[:, 2] *= -1 sequence_poses = list(_xyzrpy_to_tmat(utm, rpy).astype(dtype=np.float32)) sequence_poses = _interpolate_poses(timestamps, sequence_poses, self.timestamps, self.timestamps[0]) sequence_poses = np.stack(sequence_poses) global_poses.append(sequence_poses) return np.concatenate(global_poses) def _load_relative_poses(self) -> np.ndarray: relative_poses = [] for sequence in self.sequences: indices = self.sequence_indices[sequence] sequence_poses = self.global_poses[indices[0]:indices[1] + 1] # Convert global poses to poses that are relative to the previous frame # The initial value is zero as there is no previous frame sequence_poses_relative = [np.eye(4, dtype=np.float32)] for i in range(1, len(sequence_poses)): sequence_poses_relative.append( np.linalg.inv(sequence_poses[i - 1]) @ sequence_poses[i]) sequence_poses_relative = np.stack(sequence_poses_relative) relative_poses.append(sequence_poses_relative) return np.concatenate(relative_poses) def _compute_relative_distance(self) -> List[float]: relative_distances = [0.] for index in range(1, len(self.timestamps)): relative_distances.append(self._load_relative_distance(index)) return relative_distances def _filter_by_index(self, keep_indices: Union[np.ndarray, List[int]]) -> None: # Remove all timestamps, images, and velocities without poses self.left_img_filenames = [self.left_img_filenames[i] for i in keep_indices] if self.left_img_filenames else [] self.right_img_filenames = [self.right_img_filenames[i] for i in keep_indices] if self.right_img_filenames else [] self.left_mask_filenames = [self.left_mask_filenames[i] for i in keep_indices] if self.left_mask_filenames else [] self.right_mask_filenames = [self.right_mask_filenames[i] for i in keep_indices] if self.right_mask_filenames else [] self.velocity = [self.velocity[i] for i in keep_indices] self.timestamps = [self.timestamps[i] for i in keep_indices] self.global_poses = [self.global_poses[i] for i in keep_indices] def _filter_by_distance(self, min_distance: float) -> None: distance = 0 keep_indices = [0] relative_distances = [0] for i, relative_distance in enumerate(self.relative_distances[1:], start=1): distance += np.abs(relative_distance) if distance >= min_distance: keep_indices.append(i) relative_distances.append(distance) distance = 0 self._filter_by_index(keep_indices) # Do no re-compute with function to exploit higher frequency self.relative_distances = relative_distances def __getitem__(self, index: int) -> Dict[Any, Tensor]: img_filenames, mask_filenames, index, do_color_augmentation, do_flip = self._pre_getitem( index) # Load the color images item = {('index'): index} original_image_shape = None for frame_id in self.frame_ids: rgb = Image.open(img_filenames[index + frame_id]).convert('RGB') if frame_id == 0: original_image_shape = rgb.size if do_flip: rgb = rgb.transpose(Image.FLIP_LEFT_RIGHT) if len(self.scales) > 0: # Immediately rescale since the images are of various size across the sequences # Keeping the original resolution means that pytorch cannot batch images from # different sequences rgb = self.resize[0](rgb) item[('rgb', frame_id, 0)] = rgb else: item[('rgb', frame_id, -1)] = rgb # Adjusting intrinsics to match each scale in the pyramid for scale in range(len(self.scales)): camera_matrix, inv_camera_matrix = self._scale_camera_matrix(self.camera_matrix, scale) item[('camera_matrix', scale)] = camera_matrix item[('inv_camera_matrix', scale)] = inv_camera_matrix if len(self.scales) == 0: # Load camera matrix of raw image data camera_matrix = self.camera_matrix.copy() camera_matrix[0, :] *= original_image_shape[0] camera_matrix[1, :] *= original_image_shape[1] item[('camera_matrix', -1)] = camera_matrix item[('inv_camera_matrix', -1)] = np.linalg.pinv(camera_matrix) # Load relative distance (except for first frame) for frame_id in self.frame_ids[1:]: item['relative_distance', frame_id] = self.relative_distances[index + frame_id] if self.include_poses: # Load the poses (except for the first frame) for frame_id in self.frame_ids[1:]: item['relative_pose', frame_id] = self.relative_poses[index + frame_id] if do_flip: # We also have to flip the relative pose (rotate around y-axis) item[('relative_pose', 0)][2, 0] *= -1 item[('relative_pose', 0)][0, 2] *= -1 item[('absolute_pose', frame_id)] = self.global_poses[index + frame_id] self._post_getitem(item, do_color_augmentation) # This will also call _preprocess() return item def _load_relative_distance(self, index: int) -> float: """ Distance in meters and with respect to the previous frame """ previous_timestamp = self.timestamps[index - 1] current_timestamp = self.timestamps[index] delta_timestamp = (current_timestamp - previous_timestamp) / 1e6 # ms to s speed = self.velocity[index] # m/s distance = speed * delta_timestamp # m return distance def _preprocess( self, item: Dict[Any, Tensor], augment: Union[Callable, Tuple[Tensor, Optional[float], Optional[float], Optional[float], Optional[float]]], ) -> None: # Apply augmentation # Convert to list object as we are changing the size during the iteration for key in list(item.keys()): value = item[key] if 'rgb' in key: k, frame_id, scale = key item[key] = self._to_tensor(value) # if self.do_augmentation: item[(f'{k}_aug', frame_id, scale)] = self._to_tensor(augment(value)) elif 'camera_matrix' in key or 'inv_camera_matrix' in key: item[key] = torch.from_numpy(value) elif 'relative_pose' in key or 'absolute_pose' in key: item[key] = self._to_tensor(value) # ========================================================= def _xyzrpy_to_tmat(utm: np.ndarray, rpy: np.ndarray) -> np.ndarray: assert utm.shape == rpy.shape poses = np.array([np.eye(4)] * utm.shape[0]) poses[:, :3, :3] = Rotation.from_euler('zyx', rpy).as_matrix() poses[:, :3, 3] = utm return poses # Adapted from: # https://github.com/ori-mrg/robotcar-dataset-sdk/blob/master/python/interpolate_poses.py def _interpolate_poses(pose_timestamps, abs_poses, requested_timestamps_, origin_timestamp, absolute_poses=False): """Interpolate between absolute poses. Args: pose_timestamps (list[int]): Timestamps of supplied poses. Must be in ascending order. abs_poses (list[numpy.matrixlib.defmatrix.matrix]): SE3 matrices representing poses at the timestamps specified. requested_timestamps (list[int]): Timestamps for which interpolated timestamps are required. origin_timestamp (int): UNIX timestamp of origin frame. Poses will be reported relative to this frame. Returns: list[numpy.matrixlib.defmatrix.matrix]: SE3 matrix representing interpolated pose for each requested timestamp. Raises: ValueError: if pose_timestamps and abs_poses are not the same length ValueError: if pose_timestamps is not in ascending order """ requested_timestamps = requested_timestamps_.copy() requested_timestamps.insert(0, origin_timestamp) requested_timestamps = np.array(requested_timestamps) pose_timestamps = np.array(pose_timestamps) if len(pose_timestamps) != len(abs_poses): raise ValueError('Must supply same number of timestamps as poses') abs_quaternions = np.zeros((4, len(abs_poses))) abs_positions = np.zeros((3, len(abs_poses))) for i, pose in enumerate(abs_poses): if i > 0 and pose_timestamps[i - 1] >= pose_timestamps[i]: raise ValueError('Pose timestamps must be in ascending order') abs_quaternions[:, i] = _so3_to_quaternion(pose[0:3, 0:3]) abs_positions[:, i] = np.ravel(pose[0:3, 3]) upper_indices = [bisect.bisect(pose_timestamps, pt) for pt in requested_timestamps] if max(upper_indices) >= len(pose_timestamps): upper_indices = [min(i, len(pose_timestamps) - 1) for i in upper_indices] lower_indices = [u - 1 for u in upper_indices] fractions = (requested_timestamps - pose_timestamps[lower_indices]) / \ (pose_timestamps[upper_indices] - pose_timestamps[lower_indices]) quaternions_lower = abs_quaternions[:, lower_indices] quaternions_upper = abs_quaternions[:, upper_indices] d_array = (quaternions_lower * quaternions_upper).sum(0) linear_interp_indices = np.nonzero(d_array >= 1) sin_interp_indices = np.nonzero(d_array < 1) scale0_array = np.zeros(d_array.shape) scale1_array = np.zeros(d_array.shape) scale0_array[linear_interp_indices] = 1 - fractions[linear_interp_indices] scale1_array[linear_interp_indices] = fractions[linear_interp_indices] theta_array = np.arccos(np.abs(d_array[sin_interp_indices])) scale0_array[sin_interp_indices] = \ np.sin((1 - fractions[sin_interp_indices]) * theta_array) / np.sin(theta_array) scale1_array[sin_interp_indices] = \ np.sin(fractions[sin_interp_indices] * theta_array) / np.sin(theta_array) negative_d_indices = np.nonzero(d_array < 0) scale1_array[negative_d_indices] = -scale1_array[negative_d_indices] quaternions_interp = np.tile(scale0_array, (4, 1)) * quaternions_lower \ + np.tile(scale1_array, (4, 1)) * quaternions_upper positions_lower = abs_positions[:, lower_indices] positions_upper = abs_positions[:, upper_indices] positions_interp = np.multiply(np.tile((1 - fractions), (3, 1)), positions_lower) \ + np.multiply(np.tile(fractions, (3, 1)), positions_upper) poses_mat = np.zeros((4, 4 * len(requested_timestamps))) poses_mat[0, 0::4] = 1 - 2 * np.square(quaternions_interp[2, :]) - \ 2 * np.square(quaternions_interp[3, :]) poses_mat[0, 1::4] = 2 * np.multiply(quaternions_interp[1, :], quaternions_interp[2, :]) - \ 2 * np.multiply(quaternions_interp[3, :], quaternions_interp[0, :]) poses_mat[0, 2::4] = 2 * np.multiply(quaternions_interp[1, :], quaternions_interp[3, :]) + \ 2 * np.multiply(quaternions_interp[2, :], quaternions_interp[0, :]) poses_mat[1, 0::4] = 2 * np.multiply(quaternions_interp[1, :], quaternions_interp[2, :]) \ + 2 * np.multiply(quaternions_interp[3, :], quaternions_interp[0, :]) poses_mat[1, 1::4] = 1 - 2 * np.square(quaternions_interp[1, :]) \ - 2 * np.square(quaternions_interp[3, :]) poses_mat[1, 2::4] = 2 * np.multiply(quaternions_interp[2, :], quaternions_interp[3, :]) - \ 2 * np.multiply(quaternions_interp[1, :], quaternions_interp[0, :]) poses_mat[2, 0::4] = 2 * np.multiply(quaternions_interp[1, :], quaternions_interp[3, :]) - \ 2 * np.multiply(quaternions_interp[2, :], quaternions_interp[0, :]) poses_mat[2, 1::4] = 2 * np.multiply(quaternions_interp[2, :], quaternions_interp[3, :]) + \ 2 * np.multiply(quaternions_interp[1, :], quaternions_interp[0, :]) poses_mat[2, 2::4] = 1 - 2 * np.square(quaternions_interp[1, :]) - \ 2 * np.square(quaternions_interp[2, :]) poses_mat[0:3, 3::4] = positions_interp poses_mat[3, 3::4] = 1 if not absolute_poses: poses_mat = np.linalg.solve(poses_mat[0:4, 0:4], poses_mat) poses_out = [np.empty(0)] * (len(requested_timestamps) - 1) for i in range(1, len(requested_timestamps)): poses_out[i - 1] = np.asarray(poses_mat[0:4, i * 4:(i + 1) * 4]) return poses_out # Adapted from: # https://github.com/ori-mrg/robotcar-dataset-sdk/blob/master/python/transform.py def _so3_to_quaternion(so3): """Converts an SO3 rotation matrix to a quaternion Args: so3: 3x3 rotation matrix Returns: numpy.ndarray: quaternion [w, x, y, z] Raises: ValueError: if so3 is not 3x3 """ if so3.shape != (3, 3): raise ValueError('SO3 matrix must be 3x3') R_xx = so3[0, 0] R_xy = so3[0, 1] R_xz = so3[0, 2] R_yx = so3[1, 0] R_yy = so3[1, 1] R_yz = so3[1, 2] R_zx = so3[2, 0] R_zy = so3[2, 1] R_zz = so3[2, 2] try: w = sqrt(so3.trace() + 1) / 2 except ValueError: # w is non-real w = 0 # Due to numerical precision the value passed to `sqrt` may be a negative of the order 1e-15. # To avoid this error we clip these values to a minimum value of 0. x = sqrt(max(1 + R_xx - R_yy - R_zz, 0)) / 2 y = sqrt(max(1 + R_yy - R_xx - R_zz, 0)) / 2 z = sqrt(max(1 + R_zz - R_yy - R_xx, 0)) / 2 max_index = max(range(4), key=[w, x, y, z].__getitem__) if max_index == 0: x = (R_zy - R_yz) / (4 * w) y = (R_xz - R_zx) / (4 * w) z = (R_yx - R_xy) / (4 * w) elif max_index == 1: w = (R_zy - R_yz) / (4 * x) y = (R_xy + R_yx) / (4 * x) z = (R_zx + R_xz) / (4 * x) elif max_index == 2: w = (R_xz - R_zx) / (4 * y) x = (R_xy + R_yx) / (4 * y) z = (R_yz + R_zy) / (4 * y) elif max_index == 3: w = (R_yx - R_xy) / (4 * z) x = (R_zx + R_xz) / (4 * z) y = (R_yz + R_zy) / (4 * z) return np.array([w, x, y, z]) # ========================================================= # Use this function to save undistorted copies of the raw images provided in RobotCar def undistort_images(data_path_in: str, models_path: str) -> None: data_path_out = data_path_in data_path_in = data_path_in.rstrip('/') + '_distorted' os.rename(data_path_out, data_path_in) Path(data_path_out).mkdir(parents=True, exist_ok=True) model = CameraModel(models_path, data_path_in) image_files = sorted(x for x in Path(data_path_in).glob('*.png')) # The other photos are overexposed or taken when the car was not (yet) on the road image_files = image_files[1112:-147] num_workers = mp.cpu_count() - 1 with tqdm(total=len(image_files)) as pbar: with mp.Pool(num_workers) as pool: for _ in pool.imap_unordered( partial(_undistort, data_path_out=data_path_out, model=model), image_files): pbar.update(1) def _undistort(image_file: Path, data_path_out: str, model): new_image_file = Path(data_path_out) / image_file.name if not new_image_file.exists(): image = Image.fromarray(_load_image(str(image_file), model)) image.save(new_image_file) # Adapted from: # https://github.com/ori-mrg/robotcar-dataset-sdk/blob/master/python/image.py def _load_image(image_path, model=None, debayer=True): """Loads and rectifies an image from file. Args: image_path (str): path to an image from the dataset. model (camera_model.CameraModel): if supplied, model will be used to undistort image. Returns: numpy.ndarray: demosaiced and optionally undistorted image """ BAYER_STEREO = 'gbrg' BAYER_MONO = 'rggb' if model: camera = model.camera else: camera = re.search('(stereo|mono_(left|right|rear))', image_path).group(0) if camera == 'stereo': pattern = BAYER_STEREO else: pattern = BAYER_MONO img = Image.open(image_path) if debayer: img = demosaic(img, pattern) if model: img = model.undistort(img) return np.array(img).astype(np.uint8) # Adapted from: # https://github.com/ori-mrg/robotcar-dataset-sdk/blob/master/python/camera_model.py class CameraModel: """Provides intrinsic parameters and undistortion LUT for a camera. Attributes: camera (str): Name of the camera. camera sensor (str): Name of the sensor on the camera for multi-sensor cameras. focal_length (tuple[float]): Focal length of the camera in horizontal and vertical axis, in pixels. principal_point (tuple[float]): Principal point of camera for pinhole projection model, in pixels. G_camera_image (:obj: `numpy.matrixlib.defmatrix.matrix`): Transform from image frame to camera frame. bilinear_lut (:obj: `numpy.ndarray`): Look-up table for undistortion of images, mapping pixels in an undistorted image to pixels in the distorted image """ def __init__(self, models_dir, images_dir): """Loads a camera model from disk. Args: models_dir (str): directory containing camera model files. images_dir (str): directory containing images for which to read camera model. """ self.camera = None self.camera_sensor = None self.focal_length = None self.principal_point = None self.G_camera_image = None self.bilinear_lut = None self.__load_intrinsics(models_dir, images_dir) self.__load_lut(models_dir, images_dir) def project(self, xyz, image_size): """Projects a pointcloud into the camera using a pinhole camera model. Args: xyz (:obj: `numpy.ndarray`): 3xn array, where each column is (x, y, z) point relative to camera frame. image_size (tuple[int]): dimensions of image in pixels Returns: numpy.ndarray: 2xm array of points, where each column is the (u, v) pixel coordinates of a point in pixels. numpy.array: array of depth values for points in image. Note: Number of output points m will be less than or equal to number of input points n, as points that do not project into the image are discarded. """ if xyz.shape[0] == 3: xyz = np.stack((xyz, np.ones((1, xyz.shape[1])))) xyzw = np.linalg.solve(self.G_camera_image, xyz) # Find which points lie in front of the camera in_front = [i for i in range(0, xyzw.shape[1]) if xyzw[2, i] >= 0] xyzw = xyzw[:, in_front] uv = np.vstack((self.focal_length[0] * xyzw[0, :] / xyzw[2, :] + self.principal_point[0], self.focal_length[1] * xyzw[1, :] / xyzw[2, :] + self.principal_point[1])) in_img = [ i for i in range(0, uv.shape[1]) if 0.5 <= uv[0, i] <= image_size[1] and 0.5 <= uv[1, i] <= image_size[0] ] return uv[:, in_img], np.ravel(xyzw[2, in_img]) def undistort(self, image): """Undistorts an image. Args: image (:obj: `numpy.ndarray`): A distorted image. Must be demosaiced - ie. must be a 3-channel RGB image. Returns: numpy.ndarray: Undistorted version of image. Raises: ValueError: if image size does not match camera model. ValueError: if image only has a single channel. """ if image.shape[0] * image.shape[1] != self.bilinear_lut.shape[0]: raise ValueError('Incorrect image size for camera model') lut = self.bilinear_lut[:, 1::-1].T.reshape((2, image.shape[0], image.shape[1])) if len(image.shape) == 1: raise ValueError('Undistortion function only works with multi-channel images') undistorted = np.rollaxis( np.array([ map_coordinates(image[:, :, channel], lut, order=1) for channel in range(0, image.shape[2]) ]), 0, 3) return undistorted.astype(image.dtype) def __get_model_name(self, images_dir): self.camera = re.search('(stereo|mono_(left|right|rear))', images_dir).group(0) if self.camera == 'stereo': self.camera_sensor = re.search('(left|center_distorted|centre_distorted|right)', images_dir).group(0) if self.camera_sensor == 'left': return 'stereo_wide_left' if self.camera_sensor == 'right': return 'stereo_wide_right' if self.camera_sensor in ['center_distorted', 'centre_distorted']: return 'stereo_narrow_left' raise RuntimeError('Unknown camera model for given directory: ' + images_dir) return self.camera def __load_intrinsics(self, models_dir, images_dir): model_name = self.__get_model_name(images_dir) intrinsics_path = os.path.join(models_dir, model_name + '.txt') with open(intrinsics_path, 'r', encoding='utf-8') as intrinsics_file: vals = [float(x) for x in next(intrinsics_file).split()] self.focal_length = (vals[0], vals[1]) self.principal_point = (vals[2], vals[3]) G_camera_image = [] for line in intrinsics_file: G_camera_image.append([float(x) for x in line.split()]) self.G_camera_image = np.array(G_camera_image) def __load_lut(self, models_dir, images_dir): model_name = self.__get_model_name(images_dir) lut_path = os.path.join(models_dir, model_name + '_distortion_lut.bin') lut = np.fromfile(lut_path, np.double) lut = lut.reshape([2, lut.size // 2]) self.bilinear_lut = lut.transpose() # ========================================================= if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('img_path', type=str) parser.add_argument('models_path', type=str) args = parser.parse_args() undistort_images(args.img_path, args.models_path) # dataset = Robotcar('/home/voedisch/data/robotcar', # '2015-08-12-15-04-18', (0, -1, 1), (0, 1, 2, 3), # 192, # 640, # poses=True) # print(dataset[0].keys()) ================================================ FILE: datasets/utils.py ================================================ import pickle import random from abc import abstractmethod from os import PathLike from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union import matplotlib.pyplot as plt import numpy as np import torchvision.transforms.functional as F from torch import Tensor from torch.utils.data import Dataset as TorchDataset from torchvision import transforms from torchvision.transforms import Compose, Lambda class Dataset(TorchDataset): def __init__( self, data_path: Union[str, Path, PathLike], frame_ids: Union[List[int], Tuple[int, ...]], scales: Optional[Union[List[int], Tuple[int, ...]]] = None, height: Optional[int] = None, width: Optional[int] = None, do_augmentation: bool = False, views: Union[List[str], Tuple[str, ...]] = ('left', ), with_depth: bool = False, with_mask: bool = False, use_cache: bool = False, min_distance: float = 0, ) -> None: super().__init__() if any(v not in ['left', 'right'] for v in views): raise ValueError('views must be one of ["left", "right"]') if sum(x is None for x in [scales, height, width]) == 1: raise ValueError('Either none or all of [scales, height, width] must be omitted.') self.data_path = Path(data_path) self.frame_ids = sorted(frame_ids) self.scales = scales if scales is not None else () self.height = height self.width = width self.do_augmentation = do_augmentation self.views = tuple(set(views)) self.with_depth = with_depth self.with_mask = with_mask self.min_distance = min_distance # Minimum distance between poses [in meters] # If loading from the original source takes some time, we cache the extracted data self.use_cache = use_cache self.cache_file = None # Data augmentation self.brightness = (0.8, 1.2) self.contrast = (0.8, 1.2) self.saturation = (0.8, 1.2) self.hue = (-0.1, 0.1) # Precompute the resize functions for each scale relative to the previous scale # If scales is None, the size of the raw data will be used self.resize = {} for s in self.scales: exp_scale = 2**s self.resize[s] = transforms.Resize((self.height // exp_scale, self.width // exp_scale), interpolation=transforms.InterpolationMode.LANCZOS) self.sequence_indices = {} self.left_img_filenames = [] self.right_img_filenames = [] self.left_mask_filenames = [] self.right_mask_filenames = [] def _load_image_filenames(self) -> None: if 'left' in self.views: self.left_img_filenames = self._get_filenames(mode='rgb_left') if 'right' in self.views: self.right_img_filenames = self._get_filenames(mode='rgb_right') if len(self.views) == 2: assert len(self.left_img_filenames) == len(self.right_img_filenames) def _load_mask_filenames(self) -> None: if 'left' in self.views: self.left_mask_filenames = self._get_filenames(mode='mask_left') if 'right' in self.views: self.right_mask_filenames = self._get_filenames(mode='mask_right') if len(self.views) == 2: assert len(self.left_mask_filenames) == len(self.right_mask_filenames) def _len_frames(self) -> int: """ Subtract requested neighboring frames on either side. """ if 'left' in self.views: return len(self.left_img_filenames) - 2 * len(self.sequence_indices) return len(self.right_img_filenames) - 2 * len(self.sequence_indices) def __len__(self) -> int: """ Multiply by number of views to account for left and right images. """ return len(self.views) * self._len_frames() def _scale_camera_matrix(self, camera_matrix: np.ndarray, scale: int) -> Tuple[np.ndarray, np.ndarray]: scaled_camera_matrix = camera_matrix.copy() scaled_camera_matrix[0, :] *= self.width // (2**scale) scaled_camera_matrix[1, :] *= self.height // (2**scale) inv_scaled_camera_matrix = np.linalg.pinv(scaled_camera_matrix) return scaled_camera_matrix, inv_scaled_camera_matrix def _pre_getitem(self, index: int) -> Tuple[List[Path], List[Path], int, bool, bool]: if index < 0 or index >= self.__len__(): raise IndexError() if len(self.views) == 2: if index < self._len_frames(): img_filenames = self.left_img_filenames else: img_filenames = self.right_img_filenames index -= self._len_frames() elif self.views[0] == 'left': img_filenames = self.left_img_filenames else: img_filenames = self.right_img_filenames mask_filenames = [] if self.with_mask: if len(self.views) == 2: if index < self._len_frames(): mask_filenames = self.left_mask_filenames else: mask_filenames = self.right_mask_filenames index -= self._len_frames() elif self.views[0] == 'left': mask_filenames = self.left_mask_filenames else: mask_filenames = self.right_mask_filenames # Get number of shift indices # The formula is: index + 2*i + 1, where i is the i-th element of the ordered sequences for i, seq_indices in enumerate(self.sequence_indices.values()): if seq_indices[0] < index + 2 * i + 1 < seq_indices[1]: index += 2 * i + 1 break # Determine whether to apply data augmentation do_color_augmentation = self.do_augmentation and random.random() > .5 do_flip = self.do_augmentation and random.random() > .5 return img_filenames, mask_filenames, index, do_color_augmentation, do_flip def _post_getitem(self, item: Dict[Any, Any], do_color_augmentation: bool) -> None: # Resize images # Convert to list object as we are changing the size during the iteration for key in list(item.keys()): if 'rgb' in key or 'mask' in key: k, frame_id, _ = key for scale in self.scales: if scale == 0: continue item[(k, frame_id, scale)] = self.resize[scale](item[(k, frame_id, scale - 1)]) # Apply color augmentation if do_color_augmentation: color_augmentation = get_random_color_jitter(self.brightness, self.contrast, self.saturation, self.hue) self._preprocess(item, color_augmentation) else: self._preprocess(item, augment=(lambda x: x)) def _load_from_cache(self, cache_name: str) -> Union[None, Any]: if self.cache_file is None: raise RuntimeError('cache_file not set for this dataset.') # Separate cache files for the different names to avoid reading a large cache file cache_file = self.cache_file.parent / \ f'{self.cache_file.stem}_{cache_name}{self.cache_file.suffix}' if cache_file.exists(): with open(cache_file, 'rb') as f: data = pickle.load(f) return data return None def _save_to_cache(self, cache_name: str, data: Any, replace: bool = False) -> None: if self.cache_file is None: raise RuntimeError('cache_file not set for this dataset.') # Separate cache files for the different names to avoid reading a large cache file cache_file = self.cache_file.parent / \ f'{self.cache_file.stem}_{cache_name}{self.cache_file.suffix}' # Write new file or replace existing if not cache_file.exists() or replace: with open(cache_file, 'wb') as f: pickle.dump(data, f, pickle.HIGHEST_PROTOCOL) @abstractmethod def _get_filenames(self, mode: str) -> List[Path]: raise NotImplementedError @abstractmethod def _preprocess( self, item: Dict[Any, Any], augment: Union[Callable, Tuple[Tensor, Optional[float], Optional[float], Optional[float], Optional[float]]], ) -> None: raise NotImplementedError @abstractmethod def __getitem__(self, index: int) -> Dict[Any, Tensor]: raise NotImplementedError @staticmethod def _to_tensor(data) -> Tensor: return transforms.ToTensor()(data) def get_item_filenames(self, index: int): all_img_filenames, all_mask_filenames, index, _, _ = self._pre_getitem(index) img_filenames = [] mask_filenames = [] for frame_id in self.frame_ids: img_filenames.append(all_img_filenames[index + frame_id]) if all_mask_filenames: mask_filenames.append(all_mask_filenames[index + frame_id]) filenames = { 'index': index, 'images': img_filenames, 'masks': mask_filenames, } return filenames # ============================================================================= # Adapted from: # https://github.com/pytorch/vision/pull/3001#issuecomment-814919958 def get_random_color_jitter( brightness: Optional[Tuple[float, float]] = None, contrast: Optional[Tuple[float, float]] = None, saturation: Optional[Tuple[float, float]] = None, hue: Optional[Tuple[float, float]] = None, ) -> Compose: transforms_ = [] if brightness is not None: brightness_factor = random.uniform(brightness[0], brightness[1]) transforms_.append(Lambda(lambda img: F.adjust_brightness(img, brightness_factor))) if contrast is not None: contrast_factor = random.uniform(contrast[0], contrast[1]) transforms_.append(Lambda(lambda img: F.adjust_contrast(img, contrast_factor))) if saturation is not None: saturation_factor = random.uniform(saturation[0], saturation[1]) transforms_.append(Lambda(lambda img: F.adjust_saturation(img, saturation_factor))) if hue is not None: hue_factor = random.uniform(hue[0], hue[1]) transforms_.append(Lambda(lambda img: F.adjust_hue(img, hue_factor))) random.shuffle(transforms_) transforms_ = Compose(transforms_) return transforms_ # ============================================================================= def augment_data(sample): # Data augmentation brightness = (0.8, 1.2) contrast = (0.8, 1.2) saturation = (0.8, 1.2) hue = (-0.1, 0.1) augmentation = get_random_color_jitter(brightness, contrast, saturation, hue) augmented_sample = {} for key, value in sample.items(): if 'rgb_aug' in key: augmented_sample[key] = transforms.ToTensor()(augmentation(transforms.ToPILImage()( value[0, ...]).convert('RGB'))).unsqueeze(0) else: augmented_sample[key] = value.detach().clone() return augmented_sample # ============================================================================= def show_images(batch, scales=(0, 1, 2, 3), frames=(-1, 0, 1), augmented=False): batch_size = batch['index'].shape[0] num_frames = len(frames) for s in scales: fig, axs = plt.subplots(nrows=batch_size, ncols=num_frames, figsize=(10, 18 / 15 * batch_size)) axs = axs.reshape(batch_size, num_frames) for b in range(batch_size): for f in frames: if augmented: axs[b, f + 1].imshow(batch['rgb_aug', f, s][b, :, :, :].permute(1, 2, 0)) else: axs[b, f + 1].imshow(batch['rgb', f, s][b, :, :, :].permute(1, 2, 0)) axs[b, f + 1].axis('off') if f != -1: axs[b, f + 1].set_title(f'{batch["relative_distance", f][b]:.2f}') fig.tight_layout() fig.show() plt.close(fig) ================================================ FILE: depth_pose_prediction/__init__.py ================================================ import depth_pose_prediction.utils from depth_pose_prediction.config import DepthPosePrediction as Config from depth_pose_prediction.depth_pose_prediction import DepthPosePrediction ================================================ FILE: depth_pose_prediction/config.py ================================================ import dataclasses from pathlib import Path from typing import Optional, Tuple, Union @dataclasses.dataclass class DepthPosePrediction: config_file: Path train_set: Optional[Union[Tuple[int, ...], int, str]] val_set: Optional[Union[Tuple[int, ...], Tuple[str, ...], int, str]] resnet_depth: int resnet_pose: int resnet_pretrained: bool scales: Tuple[int, ...] learning_rate: float scheduler_step_size: int batch_size: int num_workers: int num_epochs: int min_depth: Optional[float] max_depth: Optional[float] disparity_smoothness: float velocity_loss_scaling: Optional[float] mask_dynamic: bool log_path: Path save_frequency: int save_val_depth: bool save_val_depth_batches: int multiple_gpus: bool gpu_ids: Optional[Tuple[int, ...]] load_weights_folder: Optional[Path] use_wandb: Optional[bool] ================================================ FILE: depth_pose_prediction/depth_pose_prediction.py ================================================ import shutil import warnings from pathlib import Path from typing import Any, Dict, Optional, Tuple, Union import cv2 import matplotlib.pyplot as plt import numpy as np import torch import torch.nn.functional as F import wandb from PIL import Image from torch import Tensor, nn, optim from torch.utils.data import DataLoader from tqdm import tqdm from datasets import Cityscapes from datasets import Config as DatasetConfig from datasets import Kitti, Robotcar from depth_pose_prediction.config import DepthPosePrediction as Config from depth_pose_prediction.networks import ( SSIM, BackprojectDepth, DepthDecoder, PoseDecoder, Project3D, ResnetEncoder, ) from depth_pose_prediction.utils import ( disp_to_depth, h_concat_images, transformation_from_parameters, ) # matplotlib.use('Agg') class DepthPosePrediction: def __init__(self, dataset_config: DatasetConfig, config: Config, use_online: bool = False): # Initialize parameters =========================== self.config_file = config.config_file self.dataset_type = dataset_config.dataset self.dataset_path = dataset_config.dataset_path self.height = dataset_config.height self.width = dataset_config.width self.train_set = config.train_set self.val_set = config.val_set self.resnet_depth = config.resnet_depth self.resnet_pose = config.resnet_pose self.resnet_pretrained = config.resnet_pretrained self.scales = config.scales self.learning_rate = config.learning_rate self.scheduler_step_size = config.scheduler_step_size self.batch_size = config.batch_size self.num_workers = config.num_workers self.num_epochs = config.num_epochs self.min_depth = config.min_depth self.max_depth = config.max_depth self.disparity_smoothness = config.disparity_smoothness self.velocity_loss_scaling = config.velocity_loss_scaling self.mask_dynamic = config.mask_dynamic self.log_path = config.log_path self.save_frequency = config.save_frequency self.save_val_depth = config.save_val_depth self.save_val_depth_batches = config.save_val_depth_batches self.multiple_gpus = config.multiple_gpus self.gpu_ids = config.gpu_ids self.load_weights_folder = config.load_weights_folder self.use_wandb = False # Internal parameters ============================= self.is_trained = False # Fixed parameters ================================ self.frame_ids = (0, -1, 1) self.num_pose_frames = 2 self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Deal with dependent parameters ================== if self.load_weights_folder is not None: self.load_weights_folder = Path(self.load_weights_folder).absolute() if isinstance(self.train_set, list): self.train_set = tuple(self.train_set) if isinstance(self.val_set, list): self.val_set = tuple(self.val_set) if dataset_config.dataset == 'Kitti': if isinstance(self.val_set, int): self.val_set = (self.val_set,) if isinstance(self.train_set, str): if self.train_set != 'all': raise ValueError('train_set of KITTI only accepts these strings: ["all"]') self.train_set = tuple([ # pylint: disable=consider-using-generator s for s in range(11) if s not in self.val_set and s != 3 # No IMU for seq 3 ]) elif isinstance(self.train_set, int): self.train_set = (self.train_set,) if not (isinstance(self.train_set, tuple) and isinstance(self.train_set[0], int)): raise ValueError('Passed invalid value for train_set') if not (isinstance(self.val_set, tuple) and isinstance(self.val_set[0], int)): raise ValueError('Passed invalid value for val_set') elif dataset_config.dataset in ['Cityscapes', 'RobotCar']: if isinstance(self.train_set, str): self.train_set = (self.train_set,) if isinstance(self.val_set, str): self.val_set = (self.val_set,) if not (isinstance(self.train_set, tuple) and isinstance(self.train_set[0], str)): raise ValueError('Passed invalid value for train_set') if not (isinstance(self.val_set, tuple) and isinstance(self.val_set[0], str)): raise ValueError('Passed invalid value for val_set') if self.multiple_gpus and not torch.cuda.is_available(): raise ValueError('Activated multiple GPUs but running on a CPU.') if self.multiple_gpus and self.gpu_ids is None: self.gpu_ids = list(range(torch.cuda.device_count())) elif self.multiple_gpus: if any(i >= torch.cuda.device_count() for i in self.gpu_ids): raise ValueError('Passed invalid GPU ID.') if not self.multiple_gpus and self.gpu_ids is not None and len(self.gpu_ids) > 1: raise ValueError('Passed multiple GPU IDs without activating multi-GPU support.') if self.gpu_ids is not None and torch.cuda.is_available(): # Set the main GPU for gradient averaging etc. torch.cuda.set_device(self.gpu_ids[0]) elif self.gpu_ids is None and torch.cuda.is_available(): self.gpu_ids = (0,) # ================================================= # Construct networks ============================== self.models = {} self.models['depth_encoder'] = ResnetEncoder(self.resnet_depth, self.resnet_pretrained) self.models['depth_decoder'] = DepthDecoder(self.models['depth_encoder'].num_ch_encoder, self.scales) self.models['pose_encoder'] = ResnetEncoder(self.resnet_pose, self.resnet_pretrained, self.num_pose_frames) self.models['pose_decoder'] = PoseDecoder(self.models['pose_encoder'].num_ch_encoder, num_input_features=1, num_frames_to_predict_for=2) self.use_online = use_online self.online_models = {} if self.use_online: self.online_models['depth_encoder'] = ResnetEncoder(self.resnet_depth, self.resnet_pretrained) self.online_models['depth_decoder'] = DepthDecoder( self.online_models['depth_encoder'].num_ch_encoder, self.scales) self.online_models['pose_encoder'] = ResnetEncoder(self.resnet_pose, self.resnet_pretrained, self.num_pose_frames) self.online_models['pose_decoder'] = PoseDecoder( self.models['pose_encoder'].num_ch_encoder, num_input_features=1, num_frames_to_predict_for=2) # ================================================= # Create the projected (warped) image ============= self.backproject_depth = {} self.project_3d = {} self.backproject_depth_single = {} self.project_3d_single = {} for scale in self.scales: h = self.height // (2 ** scale) w = self.width // (2 ** scale) self.backproject_depth[scale] = BackprojectDepth(self.batch_size, h, w) self.project_3d[scale] = Project3D(self.batch_size, h, w) self.backproject_depth_single[scale] = BackprojectDepth(1, h, w) self.project_3d_single[scale] = Project3D(1, h, w) # ================================================= # Structural similarity =========================== self.ssim = SSIM() self.ssim.to(self.device) # ================================================= # Send everything to the GPU(s) =================== if 'cuda' in self.device.type: print(f'Selected GPUs: {list(self.gpu_ids)}') if self.multiple_gpus: for model_name, m in self.models.items(): if m is not None: self.models[model_name] = nn.DataParallel(m, device_ids=self.gpu_ids) self.parameters_to_train = [] for model_name, m in self.models.items(): if m is not None: m.to(self.device) self.parameters_to_train += list(m.parameters()) self.online_parameters_to_train = [] for m in self.online_models.values(): m.to(self.device) self.online_parameters_to_train += list(m.parameters()) for m in self.backproject_depth.values(): m.to(self.device) for m in self.project_3d.values(): m.to(self.device) for m in self.backproject_depth_single.values(): m.to(self.device) for m in self.project_3d_single.values(): m.to(self.device) # ================================================= # Set up optimizer ================================ self.optimizer = optim.Adam(self.parameters_to_train, self.learning_rate) self.lr_scheduler = optim.lr_scheduler.StepLR(self.optimizer, self.scheduler_step_size, 0.1) self.epoch = 0 if use_online: self.online_optimizer = optim.Adam(self.online_parameters_to_train, self.learning_rate) else: self.online_optimizer = None # ================================================= # Construct datasets ============================== self.train_loader, self.val_loader = None, None # ================================================= # ============================================================ # Training / validation def train(self, validate: bool = False, depth_error: bool = False, pose_error: bool = False, dataloader: Optional[DataLoader] = None, verbose: bool = True, use_wandb: Optional[bool] = False) -> None: use_wandb = False if use_wandb is None else use_wandb if use_wandb: self.use_wandb = True self._init_wandb() if dataloader is None: self._create_dataloaders(validation=validate) else: validate = False self.train_loader = dataloader print(f'Training samples: {len(self.train_loader):>5}') # Training loop step = 0 starting_epoch = self.epoch + 1 for self.epoch in range(starting_epoch, self.num_epochs + 1): # Run a single epoch self._set_train() loss = [] with tqdm(unit='batches', total=len(self.train_loader), desc=f'Training epoch {self.epoch}/{self.num_epochs}', disable=not verbose) as pbar: for batch_i, sample_i in enumerate(self.train_loader): # Run a single step self.optimizer.zero_grad() outputs, losses = self._process_batch(sample_i) loss.append(losses['loss'].item()) losses['loss'].backward() self.optimizer.step() if self.use_wandb: wandb.log(losses) pbar.set_postfix(loss=np.mean(loss)) pbar.update(1) step += 1 self.lr_scheduler.step() self.is_trained = True if self.save_frequency > 0 and self.epoch % self.save_frequency == 0: self.save_model() if validate: validation_loss = self.validate() if self.use_wandb: wandb.log({'validation_loss': validation_loss}, commit=False) if depth_error: error = self.compute_depth_error(median_scaling=True, print_results=False) if self.use_wandb: wandb.log(error, commit=False) if pose_error: error = self.compute_pose_error(print_results=False) if self.use_wandb: wandb.log(error, commit=False) if self.use_wandb: wandb.log({'training_loss': np.mean(loss), 'epoch': self.epoch}) # Save the final model if self.save_frequency > -1: self.save_model() def adapt(self, online_data: Dict[Any, Tensor], training_data: Optional[Dict[Any, Tensor]] = None, online_index: int = 0, steps: int = 1, online_loss_weight: Optional[float] = None): if online_loss_weight is None: loss_weights = None elif self.batch_size == 1: loss_weights = torch.ones(1, device=self.device) else: loss_weights = torch.empty(self.batch_size, device=self.device) buffer_loss_weight = (1 - online_loss_weight) / (self.batch_size - 1) loss_weights[online_index] = online_loss_weight loss_weights[np.arange(self.batch_size) != online_index] = buffer_loss_weight if training_data is not None: self._set_adapt(freeze_encoder=True) for _ in range(steps): outputs_eval, losses = self._process_batch(training_data, loss_weights) self.optimizer.zero_grad() losses['loss'].backward() self.optimizer.step() else: self._set_eval() with torch.no_grad(): outputs_eval, losses = self._process_batch(online_data, loss_weights) return outputs_eval, losses def validate(self) -> float: """ Compute the validation loss(es) """ if not self.is_trained: warnings.warn('The model has not been trained yet.', RuntimeWarning) if self.val_loader is None: self._create_dataloaders(training=False) self._set_eval() loss = [] with torch.no_grad(), tqdm(unit='batches', total=len(self.val_loader), desc='Validation') as pbar: for batch_i, sample_i in enumerate(self.val_loader): outputs, losses = self._process_batch(sample_i) loss.append(losses['loss'].item()) if self.save_val_depth and batch_i < self.save_val_depth_batches: self.save_prediction(sample_i, outputs) pbar.set_postfix(loss=np.mean(loss)) pbar.update(1) return float(np.mean(loss)) def compute_depth_error( self, median_scaling: bool = True, print_results: bool = True, ) -> Dict[str, float]: """ Compute error metrics for depth prediction Follows monodepth2 implementation: https://github.com/nianticlabs/monodepth2/blob/master/evaluate_depth.py monodepth2 on Kittti - cap depth at 80 per standard practice - per-image median ground truth scaling (or same for entire test set) - post-process: predict for original and flipped images, then combine both disparities - pred_disp = cv2.resize(pred_disp, (gt_width, gt_height)) """ if not self.is_trained: warnings.warn('The model has not been trained yet.', RuntimeWarning) if self.dataset_type == 'Kitti': dataset = Kitti(self.dataset_path, self.val_set, frame_ids=[0], scales=[0], height=self.height, width=self.width, with_depth=True) elif self.dataset_type == 'Cityscapes': dataset = Cityscapes(self.dataset_path, self.val_set, frame_ids=[0], scales=[0], height=self.height, width=self.width, with_depth=True) else: warnings.warn(f'Unsupported dataset: {self.dataset_type}', RuntimeWarning) return {} data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=self.num_workers, pin_memory=True, drop_last=True) print(f'Validation samples: {len(dataset):>5}') ratios = [] num_samples = 0 abs_diff, abs_rel, sq_rel, a1, a2, a3, rmse_tot, rmse_log_tot = 0, 0, 0, 0, 0, 0, 0, 0 self._set_eval() with torch.no_grad(), tqdm(unit='batches', total=len(data_loader), desc='Validation') as pbar: for batch_i, sample_i in enumerate(data_loader): for key, val in sample_i.items(): sample_i[key] = val.to(self.device) gt_depth = sample_i[('depth', 0, -1)].squeeze().cpu().detach().numpy() gt_height, gt_width = gt_depth.shape # Depth prediction outputs = self._predict_disparity(sample_i) disparity = outputs[('disp', 0)].squeeze().cpu().detach().numpy() pred_depth = disp_to_depth(disparity, self.min_depth, None) # pylint: disable-next=no-member pred_depth = cv2.resize(pred_depth, (gt_width, gt_height)) # Mask out pixels without ground truth depth # or ground truth depth farther away than the maximum predicted depth if self.max_depth is not None: mask = np.logical_and(gt_depth > self.min_depth, gt_depth < self.max_depth) else: mask = gt_depth > self.min_depth pred_depth = pred_depth[mask] gt_depth = gt_depth[mask] # Introduced by SfMLearner if median_scaling: ratio = np.median(gt_depth) / np.median(pred_depth) ratios.append(ratio) pred_depth *= ratio # Cap predicted depth at min and max depth pred_depth[pred_depth < self.min_depth] = self.min_depth if self.max_depth is not None: pred_depth[pred_depth > self.max_depth] = self.max_depth # Compute error metrics thresh = np.maximum((gt_depth / pred_depth), (pred_depth / gt_depth)) a1 += np.mean(thresh < 1.25) a2 += np.mean(thresh < 1.25 ** 2) a3 += np.mean(thresh < 1.25 ** 3) rmse = (gt_depth - pred_depth) ** 2 rmse_tot += np.sqrt(np.mean(rmse)) rmse_log = (np.log(gt_depth) - np.log(pred_depth)) ** 2 rmse_log_tot += np.sqrt(np.mean(rmse_log)) abs_diff += np.mean(np.abs(gt_depth - pred_depth)) abs_rel += np.mean(np.abs(gt_depth - pred_depth) / gt_depth) sq_rel += np.mean(((gt_depth - pred_depth) ** 2) / gt_depth) num_samples += 1 pbar.update(1) metrics = { 'abs_diff': abs_diff / num_samples, 'abs_rel': abs_rel / num_samples, 'sq_rel': sq_rel / num_samples, 'a1': a1 / num_samples, 'a2': a2 / num_samples, 'a3': a3 / num_samples, 'rmse': rmse_tot / num_samples, 'rmse_log': rmse_log_tot / num_samples } if print_results: for key, value in metrics.items(): print(f'{key:<8}: {value:>6.3f}') if median_scaling: ratios = np.array(ratios) med = np.median(ratios) metrics['med_scaling'] = med if print_results: print(f'Scaling ratios | med: {med:.3f} | std: {np.std(ratios / med):.3f}') return metrics def compute_pose_error(self, print_results: bool = True) -> Dict[str, float]: if not self.is_trained: warnings.warn('The model has not been trained yet.', RuntimeWarning) if self.dataset_type == 'Kitti': dataset = Kitti(self.dataset_path, self.val_set, frame_ids=[-1, 0], scales=[0], height=self.height, width=self.width, poses=True, with_depth=True) else: warnings.warn(f'Unsupported dataset: {self.dataset_type}', RuntimeWarning) return {} data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=self.num_workers, pin_memory=True, drop_last=True) print(f'Validation samples: {len(dataset):>5}') num_samples = 0 rpe_trans, rpe_rot = 0, 0 self._set_eval() with torch.no_grad(), tqdm(unit='batches', total=len(data_loader), desc='Validation') as pbar: for batch_i, sample_i in enumerate(data_loader): image_0 = sample_i['rgb_aug', -1, 0] image_1 = sample_i['rgb_aug', 0, 0] pred_transformation, _ = self.predict_pose(image_0, image_1, as_numpy=False) pred_transformation = pred_transformation.squeeze().detach().cpu() pred_transformation = torch.linalg.inv(pred_transformation) gt_transformation = sample_i['relative_pose', 0].squeeze() rel_err = torch.linalg.inv(gt_transformation) @ pred_transformation trans_error = torch.linalg.norm(rel_err[:3, 3]).item() rot_error = np.arccos( max(min(.5 * (torch.trace(rel_err[:3, :3]).item() - 1), 1.), -1.0)) rpe_trans += trans_error rpe_rot += rot_error * 180 / np.pi num_samples += 1 pbar.update(1) metrics = {'rpe_trans': rpe_trans / num_samples, 'rpe_rot': rpe_rot / num_samples} if print_results: for key, value in metrics.items(): print(f'{key:<8}: {value:>6.3f}') return metrics # ============================================================ # Predict functions def predict(self, batch) -> Dict[Any, Tensor]: if not self.is_trained: warnings.warn('The model has not been trained yet.', RuntimeWarning) self._set_eval() with torch.no_grad(): outputs, losses = self._process_batch(batch) return outputs def predict_from_image(self, image, as_numpy: bool = True): """ Take one image as input and return the predicted depth """ if not self.is_trained: warnings.warn('The model has not been trained yet.', RuntimeWarning) self._set_eval() with torch.no_grad(): image = image.to(self.device) # Depth network features = self.models['depth_encoder'](image) disp = self.models['depth_decoder'](features)[('disp', 0)] depth = disp_to_depth(disp, self.min_depth, self.max_depth) if as_numpy: depth = depth.squeeze().cpu().detach().numpy() return depth def predict_from_images( self, image_0: Tensor, image_1: Tensor, as_numpy: bool = True, return_loss: bool = False, camera_matrix: Optional[Tensor] = None, inv_camera_matrix: Optional[Tensor] = None, relative_distance: Optional[Tensor] = None, ): """ Take two images as input and return depth for both and relative pose """ if not self.is_trained: warnings.warn('The model has not been trained yet.', RuntimeWarning) if len(image_0.shape) == 3: image_0 = image_0.unsqueeze(dim=0) if len(image_1.shape) == 3: image_1 = image_1.unsqueeze(dim=0) self._set_eval() with torch.no_grad(): image_0 = image_0.to(self.device) image_1 = image_1.to(self.device) # Depth network features_0 = self.models['depth_encoder'](image_0) disp_0 = self.models['depth_decoder'](features_0) depth_0 = disp_to_depth(disp_0[('disp', 0)], self.min_depth, self.max_depth) features_1 = self.models['depth_encoder'](image_1) disp_1 = self.models['depth_decoder'](features_1) depth_1 = disp_to_depth(disp_1[('disp', 0)], self.min_depth, self.max_depth) # Pose network pose_inputs = torch.cat([image_0, image_1], 1) pose_features = [self.models['pose_encoder'](pose_inputs)] axis_angle, translation = self.models['pose_decoder'](pose_features) transformation = transformation_from_parameters(axis_angle[:, 0], translation[:, 0], invert=False) if as_numpy: depth_0 = depth_0.squeeze().cpu().detach().numpy() depth_1 = depth_1.squeeze().cpu().detach().numpy() transformation = transformation.squeeze().cpu().detach().numpy() if return_loss: # Assume: image_0 => frame -1 | image_1 => frame 0 frame_ids = self.frame_ids self.frame_ids = (0, -1) outputs = disp_1 outputs[('axis_angle', 0, -1)] = axis_angle[:, 0] outputs[('translation', 0, -1)] = translation[:, 0] outputs[('cam_T_cam', 0, -1)] = transformation_from_parameters(axis_angle[:, 0], translation[:, 0], invert=True) inputs = { ('rgb', -1, 0): image_0, ('rgb', 0, 0): image_1, ('camera_matrix', 0): camera_matrix.to(self.device), ('inv_camera_matrix', 0): inv_camera_matrix.to(self.device), ('relative_distance', 0): relative_distance.to(self.device) } self._reconstruct_images(inputs, outputs) losses = self._compute_loss(inputs, outputs, scales=(0,)) for k, v in losses.items(): losses[k] = v.squeeze().cpu().detach().numpy() self.frame_ids = frame_ids return depth_0, depth_1, transformation, losses return depth_0, depth_1, transformation def predict_pose( self, image_0: Tensor, image_1: Tensor, as_numpy: bool = True, use_online: bool = False, ) -> Tuple[Union[Tensor, np.ndarray], Union[Tensor, np.ndarray]]: if not self.is_trained: warnings.warn('The model has not been trained yet.', RuntimeWarning) if len(image_0.shape) == 3: image_0 = image_0.unsqueeze(dim=0) if len(image_1.shape) == 3: image_1 = image_1.unsqueeze(dim=0) self._set_eval() with torch.no_grad(): image_0 = image_0.to(self.device) image_1 = image_1.to(self.device) # Pose network pose_inputs = torch.cat([image_0, image_1], 1) if use_online: pose_features = [self.online_models['pose_encoder'](pose_inputs)] axis_angle, translation = self.online_models['pose_decoder'](pose_features) else: pose_features = [self.models['pose_encoder'](pose_inputs)] axis_angle, translation = self.models['pose_decoder'](pose_features) axis_angle, translation = axis_angle[:, 0], translation[:, 0] transformation = transformation_from_parameters(axis_angle, translation, invert=False) cov_matrix = torch.eye(6, device=self.device) if as_numpy: transformation = transformation.squeeze().cpu().detach().numpy() cov_matrix = cov_matrix.cpu().detach().numpy() return transformation, cov_matrix # ============================================================ # Save / load functions def save_model(self) -> None: """Save model weights to disk """ save_folder = self.log_path / 'models' / f'weights_{self.epoch:03}' save_folder.mkdir(parents=True, exist_ok=True) # Save the network weights for model_name, model in self.models.items(): if model is None: continue save_path = save_folder / f'{model_name}.pth' if isinstance(model, nn.DataParallel): to_save = model.module.state_dict() else: to_save = model.state_dict() if 'encoder' in model_name: # ToDo: look into this # Save the sizes - these are needed at prediction time to_save['height'] = Tensor(self.height) to_save['width'] = Tensor(self.width) torch.save(to_save, save_path) # Save the optimizer and the LR scheduler optimizer_save_path = save_folder / 'optimizer.pth' to_save = { 'optimizer': self.optimizer.state_dict(), 'scheduler': self.lr_scheduler.state_dict() } torch.save(to_save, optimizer_save_path) # Save the config file config_save_path = self.log_path / 'config.yaml' shutil.copy(self.config_file, config_save_path) print(f'Saved model to: {save_folder}') def load_model(self, load_optimizer: bool = True) -> None: """Load model(s) from disk """ if self.load_weights_folder is None: print('Weights folder required to load the model is not specified.') if not self.load_weights_folder.exists(): print(f'Cannot find folder: {self.load_weights_folder}') print(f'Load model from: {self.load_weights_folder}') # Load the network weights for model_name, model in self.models.items(): if model is None: continue path = self.load_weights_folder / f'{model_name}.pth' pretrained_dict = torch.load(path, map_location=self.device) if isinstance(model, nn.DataParallel): model_dict = model.module.state_dict() else: model_dict = model.state_dict() pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} if len(pretrained_dict.keys()) == 0: raise RuntimeError(f'No fitting weights found in: {path}') model_dict.update(pretrained_dict) if isinstance(model, nn.DataParallel): model.module.load_state_dict(model_dict) else: model.load_state_dict(model_dict) self.is_trained = True if load_optimizer: # Load the optimizer and LR scheduler optimizer_load_path = self.load_weights_folder / 'optimizer.pth' try: optimizer_dict = torch.load(optimizer_load_path, map_location=self.device) if 'optimizer' in optimizer_dict: self.optimizer.load_state_dict(optimizer_dict['optimizer']) self.lr_scheduler.load_state_dict(optimizer_dict['scheduler']) self.epoch = self.lr_scheduler.last_epoch print(f'Restored optimizer and LR scheduler (resume from epoch {self.epoch}).') else: self.optimizer.load_state_dict(optimizer_dict) print('Restored optimizer (legacy mode).') except: # pylint: disable=bare-except print('Cannot find matching optimizer weights, so the optimizer is randomly ' 'initialized.') def load_online_model(self, load_optimizer: bool = True) -> None: """Load model(s) from disk """ if self.load_weights_folder is None: print('Weights folder required to load the model is not specified.') if not self.load_weights_folder.exists(): print(f'Cannot find folder: {self.load_weights_folder}') print(f'Load online model from: {self.load_weights_folder}') # Load the network weights for model_name, model in self.online_models.items(): if model is None: continue path = self.load_weights_folder / f'{model_name}.pth' pretrained_dict = torch.load(path, map_location=self.device) if isinstance(model, nn.DataParallel): model_dict = model.module.state_dict() else: model_dict = model.state_dict() pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} if len(pretrained_dict.keys()) == 0: raise RuntimeError(f'No fitting weights found in: {path}') model_dict.update(pretrained_dict) if isinstance(model, nn.DataParallel): model.module.load_state_dict(model_dict) else: model.load_state_dict(model_dict) if load_optimizer: # Load the optimizer and LR scheduler optimizer_load_path = self.load_weights_folder / 'optimizer.pth' try: optimizer_dict = torch.load(optimizer_load_path, map_location=self.device) if 'optimizer' in optimizer_dict: self.online_optimizer.load_state_dict(optimizer_dict['optimizer']) print('Restored online optimizer') else: self.online_optimizer.load_state_dict(optimizer_dict) print('Restored online optimizer (legacy mode).') except: # pylint: disable=bare-except print('Cannot find matching optimizer weights, so the optimizer is randomly ' 'initialized.') # ============================================================ # Auxiliary functions def _set_train(self) -> None: for m in self.models.values(): if m is not None: m.train() def _set_eval(self) -> None: for m in self.models.values(): if m is not None: m.eval() def _set_adapt(self, freeze_encoder: bool = True) -> None: """ Set all to train except for batch normalization (freeze parameters) Convert all models to adaptation mode: batch norm is in eval mode + frozen params Adapted from: https://github.com/Yevkuzn/CoMoDA/blob/main/code/CoMoDA.py """ for model_name, model in self.models.items(): model.eval() # To set the batch norm to eval mode for name, param in model.named_parameters(): if name.find('bn') != -1: param.requires_grad = False # Freeze batch norm if freeze_encoder and 'encoder' in model_name: param.requires_grad = False for model_name, model in self.online_models.items(): model.eval() # To set the batch norm to eval mode for name, param in model.named_parameters(): if name.find('bn') != -1: param.requires_grad = False # Freeze batch norm if freeze_encoder and 'encoder' in model_name: param.requires_grad = False def _create_dataloaders(self, training: bool = True, validation: bool = True): valid_dataset_types = ['Kitti', 'Cityscapes', 'RobotCar'] if self.dataset_type not in valid_dataset_types: raise ValueError(f'dataset_type must be one of {valid_dataset_types}') if self.train_set is not None and training: if self.dataset_type == 'Kitti': train_dataset = Kitti(self.dataset_path, self.train_set, self.frame_ids, self.scales, self.height, self.width, do_augmentation=True, views=('left', 'right'), with_mask=self.mask_dynamic) elif self.dataset_type == 'Cityscapes': train_dataset = Cityscapes(self.dataset_path, self.train_set, self.frame_ids, self.scales, self.height, self.width, do_augmentation=True, with_mask=self.mask_dynamic) else: # self.dataset_type == 'RobotCar': train_dataset = Robotcar(self.dataset_path, self.train_set, self.frame_ids, self.scales, self.height, self.width, do_augmentation=True, with_mask=self.mask_dynamic, start_frame=4000, end_frame=24000) print(f'Training samples: {len(train_dataset):>5}') self.train_loader = DataLoader(train_dataset, self.batch_size, shuffle=True, num_workers=self.num_workers, pin_memory=True, drop_last=True) if self.val_set is not None and validation: if self.dataset_type == 'Kitti': val_dataset = Kitti(self.dataset_path, self.val_set, self.frame_ids, self.scales, self.height, self.width, with_mask=self.mask_dynamic) elif self.dataset_type == 'Cityscapes': val_dataset = Cityscapes(self.dataset_path, self.val_set, self.frame_ids, self.scales, self.height, self.width, with_mask=self.mask_dynamic) else: # self.dataset_type == 'RobotCar': val_dataset = Robotcar(self.dataset_path, self.val_set, self.frame_ids, self.scales, self.height, self.width, with_mask=self.mask_dynamic, start_frame=500, end_frame=4000) print(f'Validation samples: {len(val_dataset):>5}') self.val_loader = DataLoader(val_dataset, self.batch_size, shuffle=False, num_workers=self.num_workers, pin_memory=True, drop_last=True) def _process_batch( self, inputs: Dict[Any, Tensor], loss_sample_weights: Optional[Tensor] = None, use_online: bool = False, ) -> Tuple[Dict[Any, Tensor], Dict[str, Tensor]]: """ Pass a minibatch through the network """ for key, val in inputs.items(): inputs[key] = val.to(self.device) outputs = {} outputs.update(self._predict_disparity(inputs, use_online=use_online)) outputs.update(self._predict_poses(inputs, use_online=use_online)) self._reconstruct_images(inputs, outputs) # also converts disparity to depth losses = self._compute_loss(inputs, outputs, sample_weights=loss_sample_weights) return outputs, losses def _predict_disparity(self, inputs: Dict[Any, Tensor], frame: int = 0, scale: int = 0, use_online: bool = False) -> Dict[Any, Tensor]: if use_online: features = self.online_models['depth_encoder'](inputs[('rgb_aug', frame, scale)]) outputs = self.online_models['depth_decoder'](features) else: features = self.models['depth_encoder'](inputs[('rgb_aug', frame, scale)]) outputs = self.models['depth_decoder'](features) return outputs def _predict_poses(self, inputs: Dict[Any, Tensor], use_online: bool = False) -> Dict[Any, Tensor]: """ Predict the poses: 0 -> -1 and 0 -> 1 """ assert self.num_pose_frames == 2, self.num_pose_frames assert self.frame_ids == (0, -1, 1) outputs = {} pose_inputs_dict = {f_i: inputs['rgb_aug', f_i, 0] for f_i in self.frame_ids} for frame_i in self.frame_ids[1:]: # To maintain ordering we always pass frames in temporal order if frame_i < 0: pose_inputs = [pose_inputs_dict[frame_i], pose_inputs_dict[0]] else: pose_inputs = [pose_inputs_dict[0], pose_inputs_dict[frame_i]] pose_inputs = torch.cat(pose_inputs, 1) if use_online: pose_features = [self.online_models['pose_encoder'](pose_inputs)] axis_angle, translation = self.online_models['pose_decoder'](pose_features) else: if self.models['pose_encoder'] is None: axis_angle, translation = self.models['pose_decoder'](pose_inputs) else: pose_features = [self.models['pose_encoder'](pose_inputs)] axis_angle, translation = self.models['pose_decoder'](pose_features) axis_angle, translation = axis_angle[:, 0], translation[:, 0] outputs[('axis_angle', 0, frame_i)] = axis_angle outputs[('translation', 0, frame_i)] = translation # Invert the matrix such that it is always frame 0 -> frame X outputs[('cam_T_cam', 0, frame_i)] = transformation_from_parameters(axis_angle, translation, invert=frame_i < 0) return outputs def _reconstruct_images( self, inputs: Dict[Any, Tensor], outputs: Dict[Any, Tensor], ) -> None: """Generate the warped (reprojected) color images for a minibatch. Generated images are added to the 'outputs' dictionary. """ batch_size = outputs['disp', self.scales[0]].shape[0] for scale in self.scales: # Upsample the disparity from scale to the target height x width disp = outputs[('disp', scale)] disp = F.interpolate(disp, [self.height, self.width], mode='bilinear', align_corners=False) source_scale = 0 depth = disp_to_depth(disp, self.min_depth, self.max_depth) outputs[('depth', scale)] = depth for i, frame_id in enumerate(self.frame_ids[1:]): T = outputs[('cam_T_cam', 0, frame_id)] if batch_size == 1: cam_points = self.backproject_depth_single[source_scale]( depth, inputs[('inv_camera_matrix', source_scale)]) pixel_coordinates = self.project_3d_single[source_scale]( cam_points, inputs[('camera_matrix', source_scale)], T) else: cam_points = self.backproject_depth[source_scale](depth, inputs[('inv_camera_matrix', source_scale)]) pixel_coordinates = self.project_3d[source_scale](cam_points, inputs[('camera_matrix', source_scale)], T) # Save the warped image outputs[('rgb', frame_id, scale)] = F.grid_sample(inputs[('rgb', frame_id, source_scale)], pixel_coordinates, padding_mode='border', align_corners=True) def _compute_loss( self, inputs: Dict[Any, Tensor], outputs: Dict[Any, Tensor], scales: Optional[Tuple[int, ...]] = None, sample_weights: Optional[Tensor] = None, ) -> Dict[str, Tensor]: """Compute the losses for a minibatch. """ assert self.frame_ids in ((0, -1, 1), (0, -1)) scales = self.scales if scales is None else scales if sample_weights is None: sample_weights = torch.ones(self.batch_size, device=self.device) / self.batch_size source_scale = 0 losses = {} total_loss = torch.zeros(1, device=self.device) for scale in scales: # Compute reprojection loss for every scale ======== target = inputs['rgb', 0, source_scale] reprojection_losses = [] for frame_id in self.frame_ids[1:]: pred = outputs['rgb', frame_id, scale] reprojection_losses.append(self._compute_reprojection_loss(pred, target)) reprojection_losses = torch.cat(reprojection_losses, 1) # ================================================== # Auto-masking ===================================== identity_reprojection_losses = [] for frame_id in self.frame_ids[1:]: pred = inputs['rgb', frame_id, source_scale] identity_reprojection_losses.append(self._compute_reprojection_loss(pred, target)) identity_reprojection_losses = torch.cat(identity_reprojection_losses, 1) # Add random numbers to break ties identity_reprojection_losses += torch.randn(identity_reprojection_losses.shape, device=self.device) * 0.00001 combined = torch.cat((identity_reprojection_losses, reprojection_losses), dim=1) # "minimum among computed losses allows for robust reprojection" # https://openaccess.thecvf.com/content_CVPR_2020/papers/Poggi_On_the_Uncertainty_of_Self-Supervised_Monocular_Depth_Estimation_CVPR_2020_paper.pdf to_optimize, _ = torch.min(combined, dim=1) # Mask potentially dynamic objects ================= if self.mask_dynamic: # 0: dynamic; 1: static mask = 1 - inputs['mask', 0, source_scale].squeeze() mask = mask.type(torch.bool) to_optimize = torch.masked_select(to_optimize, mask) # Also flattens the array # ================================================== # Total self-supervision (reprojection) loss ======= if not self.mask_dynamic: reprojection_loss = (to_optimize.mean(2).mean(1) * sample_weights).sum() else: reprojection_loss = to_optimize.mean() # pre-training with masks losses[f'reprojection_loss/scale_{scale}'] = reprojection_loss # ================================================== # Compute smoothness loss for every scale ========== if self.mask_dynamic: mask = 1 - inputs['mask', 0, scale] mask = mask.type(torch.bool) else: mask = torch.ones_like(outputs['disp', scale], dtype=torch.bool, device=self.device) color = inputs['rgb', 0, scale] disp = outputs['disp', scale] mean_disp = disp.mean(2, True).mean(3, True) norm_disp = disp / (mean_disp + 1e-7) smooth_loss = self._compute_smooth_loss(norm_disp, color, mask) smooth_loss = (smooth_loss * sample_weights).sum() losses[f'smooth_loss/scale_{scale}'] = smooth_loss # ================================================== regularization_loss = self.disparity_smoothness / (2 ** scale) * smooth_loss losses[f'reg_loss/scale_{scale}'] = regularization_loss # ================================================== loss = reprojection_loss + regularization_loss losses[f'depth_loss/scale_{scale}'] = loss total_loss += loss total_loss /= len(self.scales) losses['depth_loss'] = total_loss # Velocity supervision loss (scale independent) ==== if self.velocity_loss_scaling is not None and self.velocity_loss_scaling > 0: velocity_loss = self.velocity_loss_scaling * self._compute_velocity_loss( inputs, outputs) velocity_loss = (velocity_loss * sample_weights).sum() losses['velocity_loss'] = velocity_loss total_loss += velocity_loss # ================================================== losses['loss'] = total_loss if np.isnan(losses['loss'].item()): for k, v in losses.items(): print(k, v.item()) raise RuntimeError('NaN loss') return losses # ============================================================ # Losses def _compute_velocity_loss( self, inputs: Dict[Any, Tensor], outputs: Dict[Any, Tensor], ) -> Tensor: batch_size = inputs['index'].shape[0] # might be different from self.batch_size velocity_loss = torch.zeros(batch_size, device=self.device).squeeze() num_frames = 0 for frame in self.frame_ids: if frame == -1: continue if frame == 0: pred_translation = outputs[('translation', 0, -1)] else: # frame == 1 pred_translation = outputs[('translation', 0, 1)] gt_distance = torch.abs(inputs[('relative_distance', frame)]).squeeze() pred_distance = torch.linalg.norm(pred_translation, dim=-1).squeeze() velocity_loss += F.l1_loss(pred_distance, gt_distance, reduction='none') # separated by sample in batch num_frames += 1 velocity_loss /= num_frames return velocity_loss @staticmethod def _compute_smooth_loss( disp: Tensor, img: Tensor, mask: Tensor, ) -> Tensor: """Computes the smoothness loss for a disparity image The color image is used for edge-aware smoothness """ grad_disp_x = torch.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) grad_disp_y = torch.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) grad_img_x = torch.mean(torch.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), 1, keepdim=True) grad_img_y = torch.mean(torch.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), 1, keepdim=True) grad_disp_x *= torch.exp(-grad_img_x) grad_disp_y *= torch.exp(-grad_img_y) grad_disp_x = torch.masked_select(grad_disp_x, mask[..., :-1]) grad_disp_y = torch.masked_select(grad_disp_y, mask[..., :-1, :]) batch_size = disp.shape[0] smooth_loss = torch.empty(batch_size, device=disp.device) for i in range(batch_size): _grad_disp_x = torch.masked_select(grad_disp_x[i, ...], mask[i, :, :, :-1]) _grad_disp_y = torch.masked_select(grad_disp_y[i, ...], mask[i, :, :-1, :]) smooth_loss[i] = _grad_disp_x.mean() + _grad_disp_y.mean() return smooth_loss def _compute_reprojection_loss( self, pred: Tensor, target: Tensor, ) -> Tensor: """Computes reprojection loss between a batch of predicted and target images This is the photometric error """ abs_diff = torch.abs(target - pred) l1_loss = abs_diff.mean(1, True) ssim_loss = self.ssim(pred, target).mean(1, True) reprojection_loss = 0.85 * ssim_loss + 0.15 * l1_loss return reprojection_loss # ============================================================ # Logging def save_prediction( self, inputs: Dict[Any, Tensor], outputs: Dict[Any, Tensor], ) -> None: save_folder = self.log_path / 'prediction' / f'val_depth_{self.epoch:03}' save_folder.mkdir(parents=True, exist_ok=True) pil_image = None # Iterate over the images for i, index in enumerate(inputs['index']): rgb = inputs['rgb', 0, 0][i] depth = outputs['depth', 0][i] rgb_np = rgb.squeeze().cpu().detach().numpy() rgb_np = np.moveaxis(rgb_np, 0, -1) # Convert from [c, h, w] to [h, w, c] depth_np = depth.squeeze().cpu().detach().numpy() fig = plt.figure(figsize=(12.8, 9.6)) plt.subplot(211) plt.imshow(rgb_np) plt.title('Input') plt.axis('off') plt.subplot(212) vmax = np.percentile(depth_np, 95) plt.imshow(depth_np, cmap='magma_r', vmax=vmax) plt.title(f'Depth prediction | vmax={vmax:.3f}') plt.axis('off') save_file = save_folder / f'{index.item():05}.png' # fig.suptitle(str(save_file)[-50:]) fig.canvas.draw() if pil_image is None: pil_image = Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb()) elif pil_image.size[0] < 5 * self.width: pil_image = h_concat_images( pil_image, Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())) plt.savefig(save_file, bbox_inches='tight') plt.close() if self.use_wandb: wandb.log({'pred_depth': [wandb.Image(pil_image)]}) def _init_wandb(self): # Name of the run as shown in the wandb GUI name = self.log_path.name wandb.init(project='continual_slam', name=name) wandb.config.dataset_type = self.dataset_type wandb.config.train_set = self.train_set wandb.config.val_set = self.val_set wandb.config.height = self.height wandb.config.width = self.width wandb.config.batch_size = self.batch_size wandb.config.num_workers = self.num_workers wandb.config.resnet_depth = self.resnet_depth wandb.config.resnet_pose = self.resnet_pose wandb.config.learning_rate = self.learning_rate wandb.config.scheduler_step_size = self.scheduler_step_size wandb.config.min_depth = self.min_depth wandb.config.max_depth = self.max_depth wandb.config.disparity_smoothness = self.disparity_smoothness wandb.config.velocity_loss_scaling = self.velocity_loss_scaling wandb.config.mask_dynamic = self.mask_dynamic wandb.config.log_path = self.log_path ================================================ FILE: depth_pose_prediction/networks/__init__.py ================================================ from depth_pose_prediction.networks.depth_decoder import DepthDecoder from depth_pose_prediction.networks.layers import ( SSIM, BackprojectDepth, Project3D, ) from depth_pose_prediction.networks.pose_decoder import PoseDecoder from depth_pose_prediction.networks.resnet_encoder import ResnetEncoder ================================================ FILE: depth_pose_prediction/networks/depth_decoder.py ================================================ # Adapted from: # https://github.com/nianticlabs/monodepth2/blob/master/networks/depth_decoder.py from typing import Dict, Tuple import numpy as np import torch import torch.nn.functional as F from torch import Tensor, nn from .layers import Conv3x3, ConvBlock class DepthDecoder(nn.Module): def __init__( self, num_ch_encoder: np.ndarray, scales: Tuple[int, ...] = (0, 1, 2, 3), use_skips: bool = True, ) -> None: super().__init__() self.scales = scales self.use_skips = use_skips self.num_output_channels = 1 self.num_ch_encoder = num_ch_encoder self.num_ch_decoder = np.array([16, 32, 64, 128, 256]) self.convs = {} for i in range(4, -1, -1): # upconv_0 num_ch_in = self.num_ch_encoder[-1] if i == 4 else self.num_ch_decoder[i + 1] num_ch_out = self.num_ch_decoder[i] setattr(self, f'upconv_{i}_0', ConvBlock(num_ch_in, num_ch_out)) # upconv_1 num_ch_in = self.num_ch_decoder[i] if self.use_skips and i > 0: num_ch_in += self.num_ch_encoder[i - 1] num_ch_out = self.num_ch_decoder[i] setattr(self, f'upconv_{i}_1', ConvBlock(num_ch_in, num_ch_out)) for s in self.scales: setattr(self, f'dispconv_{s}', Conv3x3(self.num_ch_decoder[s], self.num_output_channels)) self.sigmoid = nn.Sigmoid() self.output = {} def forward(self, input_features: Tensor) -> Dict[Tuple[str, int], Tensor]: self.output = {} x = input_features[-1] for i in range(4, -1, -1): x = getattr(self, f'upconv_{i}_0')(x) if self.use_skips and i > 0: # Difference to monodepth2 implementation to deal with image resolutions # that cannot be integer-divided by 2, 4, 8, etc. # Only required when evaluating the depth x = [F.interpolate(x, size=input_features[i - 1].shape[2:], mode='nearest')] x += [input_features[i - 1]] else: x = [F.interpolate(x, scale_factor=2, mode='nearest')] x = torch.cat(x, 1) x = getattr(self, f'upconv_{i}_1')(x) if i in self.scales: # monodepth2 paper (w/o uncertainty) self.output[('disp', i)] = self.sigmoid(getattr(self, f'dispconv_{i}')(x)) return self.output ================================================ FILE: depth_pose_prediction/networks/layers.py ================================================ # Adapted from: # https://github.com/nianticlabs/monodepth2/blob/master/layers.py#L64 import numpy as np import torch from torch import Tensor, nn class ConvBlock(nn.Module): """Layer to perform a convolution followed by ELU. """ def __init__( self, in_channels: int, out_channels: int, ) -> None: super().__init__() self.conv = Conv3x3(in_channels, out_channels) self.nonlin = nn.ELU(inplace=True) def forward(self, x: Tensor) -> Tensor: out = self.conv(x) out = self.nonlin(out) return out class Conv3x3(nn.Module): """Layer to pad and convolve input. """ def __init__( self, in_channels: int, out_channels: int, use_reflection: bool = True, ) -> None: super().__init__() if use_reflection: self.pad = nn.ReflectionPad2d(1) else: self.pad = nn.ZeroPad2d(1) self.conv = nn.Conv2d(int(in_channels), int(out_channels), 3) def forward(self, x: Tensor) -> Tensor: out = self.pad(x) out = self.conv(out) return out class BackprojectDepth(nn.Module): """Layer to transform a depth image into a point cloud """ def __init__(self, batch_size: int, height: int, width: int) -> None: super().__init__() self.batch_size = batch_size self.height = height self.width = width meshgrid = np.meshgrid(range(self.width), range(self.height), indexing='xy') self.id_coords = np.stack(meshgrid, axis=0).astype(np.float32) self.id_coords = nn.Parameter(torch.from_numpy(self.id_coords), requires_grad=False) self.ones = nn.Parameter(torch.ones(self.batch_size, 1, self.height * self.width), requires_grad=False) self.pix_coords = torch.unsqueeze( torch.stack([self.id_coords[0].view(-1), self.id_coords[1].view(-1)], 0), 0) self.pix_coords = self.pix_coords.repeat(batch_size, 1, 1) self.pix_coords = nn.Parameter(torch.cat([self.pix_coords, self.ones], 1), requires_grad=False) def forward(self, depth, inv_K): cam_points = torch.matmul(inv_K[:, :3, :3], self.pix_coords) cam_points = depth.view(self.batch_size, 1, -1) * cam_points cam_points = torch.cat([cam_points, self.ones], 1) return cam_points class Project3D(nn.Module): """Layer which projects 3D points into a camera with intrinsics K and at position T """ def __init__(self, batch_size: int, height: int, width: int, eps: float = 1e-7) -> None: super().__init__() self.batch_size = batch_size self.height = height self.width = width self.eps = eps def forward(self, points, K, T): P = torch.matmul(K, T)[:, :3, :] cam_points = torch.matmul(P, points) pixel_coordinates = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) + self.eps) pixel_coordinates = pixel_coordinates.view(self.batch_size, 2, self.height, self.width) pixel_coordinates = pixel_coordinates.permute(0, 2, 3, 1) pixel_coordinates[..., 0] /= self.width - 1 pixel_coordinates[..., 1] /= self.height - 1 pixel_coordinates = (pixel_coordinates - 0.5) * 2 return pixel_coordinates class SSIM(nn.Module): """Layer to compute the SSIM loss between a pair of images """ def __init__(self) -> None: super().__init__() self.mu_x_pool = nn.AvgPool2d(3, 1) self.mu_y_pool = nn.AvgPool2d(3, 1) self.sig_x_pool = nn.AvgPool2d(3, 1) self.sig_y_pool = nn.AvgPool2d(3, 1) self.sig_xy_pool = nn.AvgPool2d(3, 1) self.refl = nn.ReflectionPad2d(1) self.C1 = 0.01**2 self.C2 = 0.03**2 def forward(self, x: Tensor, y: Tensor) -> Tensor: x = self.refl(x) y = self.refl(y) mu_x = self.mu_x_pool(x) mu_y = self.mu_y_pool(y) sigma_x = self.sig_x_pool(x**2) - mu_x**2 sigma_y = self.sig_y_pool(y**2) - mu_y**2 sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2) return torch.clamp((1 - SSIM_n / SSIM_d) / 2, 0, 1) ================================================ FILE: depth_pose_prediction/networks/pose_decoder.py ================================================ # Adapted from: # https://github.com/nianticlabs/monodepth2/blob/master/networks/pose_decoder.py from typing import Optional, Tuple import numpy as np import torch from torch import Tensor, nn class PoseDecoder(nn.Module): def __init__( self, num_ch_encoder: np.ndarray, num_input_features: int, num_frames_to_predict_for: Optional[int] = None, ) -> None: super().__init__() self.num_ch_encoder = num_ch_encoder self.num_input_features = num_input_features if num_frames_to_predict_for is None: num_frames_to_predict_for = num_input_features - 1 self.num_frames_to_predict_for = num_frames_to_predict_for setattr(self, 'squeeze', nn.Conv2d(self.num_ch_encoder[-1], 256, 1)) setattr(self, 'pose_0', nn.Conv2d(num_input_features * 256, 256, 3, 1, 1)) setattr(self, 'pose_1', nn.Conv2d(256, 256, 3, 1, 1)) setattr(self, 'pose_2', nn.Conv2d(256, 6 * num_frames_to_predict_for, 1)) self.relu = nn.ReLU() self.tanh = nn.Tanh() self.softplus = nn.Softplus() self.sigmoid = nn.Sigmoid() def forward(self, input_features: Tensor) -> Tuple[Tensor, Tensor]: last_features = [f[-1] for f in input_features] cat_features = [self.relu(getattr(self, 'squeeze')(f)) for f in last_features] cat_features = torch.cat(cat_features, 1) out = cat_features for i in range(3): out = getattr(self, f'pose_{i}')(out) if i != 2: out = self.relu(out) out = out.mean(3).mean(2) out = 0.01 * out.view(-1, self.num_frames_to_predict_for, 1, 6) axis_angle = out[..., :3] translation = out[..., 3:] return axis_angle, translation ================================================ FILE: depth_pose_prediction/networks/resnet_encoder.py ================================================ # Adapted from: # https://github.com/nianticlabs/monodepth2/blob/master/networks/resnet_encoder.py from typing import List, Type, Union import numpy as np import torch from torch import Tensor, nn from torch.utils import model_zoo from torchvision import models class ResNetMultiImageInput(models.ResNet): """Constructs a resnet model with varying number of input images. Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py """ def __init__( self, block: Type[Union[models.resnet.BasicBlock, models.resnet.Bottleneck]], layers: List[int], num_input_images: int = 1, ) -> None: super().__init__(block, layers) self.inplanes = 64 self.conv1 = nn.Conv2d(num_input_images * 3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) def resnet_multiimage_input( num_layers: int, pretrained: bool = False, num_input_images: int = 1, ) -> models.ResNet: """Constructs a resnet model with varying number of input images. :param num_layers: Specifies the ResNet model to be used :param pretrained: If True, returns a model pre-trained on ImageNet :param num_input_images: Specifies the number of input images. For PoseNet, >1. """ if num_layers not in [18, 50]: raise ValueError('Resnet multi-image can only run with 18 or 50 layers.') block_type = { 18: models.resnet.BasicBlock, 50: models.resnet.Bottleneck, }[num_layers] blocks = { 18: [2, 2, 2, 2], 50: [3, 4, 6, 3], }[num_layers] model = ResNetMultiImageInput(block_type, blocks, num_input_images=num_input_images) if pretrained: loaded = model_zoo.load_url(models.resnet.model_urls[f'resnet{num_layers}']) loaded['conv1.weight'] = torch.cat([loaded['conv1.weight']] * num_input_images, 1) / num_input_images model.load_state_dict(loaded) return model class ResnetEncoder(nn.Module): AVAILABLE_RESNETS = { 18: models.resnet18, 34: models.resnet34, # 50: models.resnet50, } def __init__( self, num_layers: int, pretrained: bool, num_input_images: int = 1, ) -> None: """Constructs a ResNet model. :param num_layers: Specifies the ResNet model to be used :param pretrained: If True, returns a model pre-trained on ImageNet :param num_input_images: Specifies the number of input images. For PoseNet, >1. """ super().__init__() self.num_ch_encoder = np.array([64, 64, 128, 256, 512]) if num_layers not in self.AVAILABLE_RESNETS.keys(): raise ValueError(f'Could not find a ResNet model with {num_layers} layers.') if num_input_images < 1: raise ValueError(f'Invalid value ({num_input_images}) for num_input_images.') if num_input_images == 1: self.resnet = self.AVAILABLE_RESNETS[num_layers](pretrained) else: self.resnet = resnet_multiimage_input(num_layers, pretrained, num_input_images) # From paper if num_layers > 34: self.num_ch_encoder[1:] *= 4 def forward(self, x: Tensor) -> List[Tensor]: features = [] x = (x - 0.45) / 0.225 x = self.resnet.conv1(x) x = self.resnet.bn1(x) features.append(self.resnet.relu(x)) features.append(self.resnet.layer1(self.resnet.maxpool(features[-1]))) features.append(self.resnet.layer2(features[-1])) features.append(self.resnet.layer3(features[-1])) features.append(self.resnet.layer4(features[-1])) return features ================================================ FILE: depth_pose_prediction/pytorch3d.py ================================================ # Copied from: # https://pytorch3d.readthedocs.io/en/latest/_modules/pytorch3d/transforms/rotation_conversions.html import torch import torch.nn.functional as F def quaternion_to_axis_angle(quaternions: torch.Tensor) -> torch.Tensor: """ Convert rotations given as quaternions to axis/angle. Args: quaternions: quaternions with real part first, as tensor of shape (..., 4). Returns: Rotations given as a vector in axis angle form, as a tensor of shape (..., 3), where the magnitude is the angle turned anticlockwise in radians around the vector's direction. """ norms = torch.norm(quaternions[..., 1:], p=2, dim=-1, keepdim=True) half_angles = torch.atan2(norms, quaternions[..., :1]) angles = 2 * half_angles eps = 1e-6 small_angles = angles.abs() < eps sin_half_angles_over_angles = torch.empty_like(angles) sin_half_angles_over_angles[~small_angles] = (torch.sin(half_angles[~small_angles]) / angles[~small_angles]) # for x small, sin(x/2) is about x/2 - (x/2)^3/6 # so sin(x/2)/x is about 1/2 - (x*x)/48 sin_half_angles_over_angles[small_angles] = (0.5 - (angles[small_angles] * angles[small_angles]) / 48) return quaternions[..., 1:] / sin_half_angles_over_angles def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor: """ Convert rotations given as rotation matrices to quaternions. Args: matrix: Rotation matrices as tensor of shape (..., 3, 3). Returns: quaternions with real part first, as tensor of shape (..., 4). """ if matrix.size(-1) != 3 or matrix.size(-2) != 3: raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.") batch_dim = matrix.shape[:-2] m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(matrix.reshape(batch_dim + (9, )), dim=-1) q_abs = _sqrt_positive_part( torch.stack( [ 1.0 + m00 + m11 + m22, 1.0 + m00 - m11 - m22, 1.0 - m00 + m11 - m22, 1.0 - m00 - m11 + m22, ], dim=-1, )) # we produce the desired quaternion multiplied by each of r, i, j, k quat_by_rijk = torch.stack( [ torch.stack([q_abs[..., 0]**2, m21 - m12, m02 - m20, m10 - m01], dim=-1), torch.stack([m21 - m12, q_abs[..., 1]**2, m10 + m01, m02 + m20], dim=-1), torch.stack([m02 - m20, m10 + m01, q_abs[..., 2]**2, m12 + m21], dim=-1), torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3]**2], dim=-1), ], dim=-2, ) # We floor here at 0.1 but the exact level is not important; if q_abs is small, # the candidate won't be picked. flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device) quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr)) # if not for numerical problems, quat_candidates[i] should be same (up to a sign), # forall i; we pick the best-conditioned one (with the largest denominator) return quat_candidates[F.one_hot(q_abs.argmax( dim=-1), num_classes=4) > 0.5, : # pyre-ignore[16] ].reshape(batch_dim + (4, )) def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor: """ Returns torch.sqrt(torch.max(0, x)) but with a zero subgradient where x is 0. """ ret = torch.zeros_like(x) positive_mask = x > 0 ret[positive_mask] = torch.sqrt(x[positive_mask]) return ret ================================================ FILE: depth_pose_prediction/utils.py ================================================ from typing import Optional, Tuple, Union import numpy as np import torch from PIL import Image from torch import Tensor from depth_pose_prediction.pytorch3d import ( matrix_to_quaternion, quaternion_to_axis_angle, ) def parameters_from_transformation( transformation: Tensor, as_numpy: bool = False, ) -> Tuple[Tensor, Tensor]: """Convert a 4x4 transformation matrix to a translation vector and the axis angles """ translation_vector = transformation[:, :, :3, 3] axis_angle = quaternion_to_axis_angle(matrix_to_quaternion(transformation[:, :, :3, :3])) if as_numpy: translation_vector = translation_vector.squeeze().cpu().numpy() axis_angle = axis_angle.squeeze().cpu().numpy() return translation_vector, axis_angle # ----------------------------------------------------------------------------- # The code below is adapted from: # https://github.com/nianticlabs/monodepth2/blob/master/layers.py def transformation_from_parameters( axis_angle: Tensor, translation: Tensor, invert: bool = False, ) -> Tensor: """Convert the network's (axis angle, translation) output into a 4x4 matrix """ R = rot_from_axisangle(axis_angle) t = translation.clone() if invert: R = R.transpose(1, 2) t *= -1 T = get_translation_matrix(t) if invert: M = torch.matmul(R, T) else: M = torch.matmul(T, R) return M def get_translation_matrix(translation_vector: Tensor) -> Tensor: """Convert a translation vector into a 4x4 transformation matrix """ T = torch.zeros(translation_vector.shape[0], 4, 4).to(device=translation_vector.device) t = translation_vector.contiguous().view(-1, 3, 1) T[:, 0, 0] = 1 T[:, 1, 1] = 1 T[:, 2, 2] = 1 T[:, 3, 3] = 1 T[:, :3, 3, None] = t return T def rot_from_axisangle(axis_angle: Tensor) -> Tensor: """Convert an axisangle rotation into a 4x4 transformation matrix (adapted from https://github.com/Wallacoloo/printipi) Input 'axis_angle' has to be Bx1x3 This is similar to the code below but has less rounding: from scipy.spatial.transform import Rotation Rotation.from_rotvec(axis_angle).as_matrix() """ angle = torch.norm(axis_angle, 2, 2, True) axis = axis_angle / (angle + 1e-7) ca = torch.cos(angle) sa = torch.sin(angle) C = 1 - ca x = axis[..., 0].unsqueeze(1) y = axis[..., 1].unsqueeze(1) z = axis[..., 2].unsqueeze(1) xs = x * sa ys = y * sa zs = z * sa xC = x * C yC = y * C zC = z * C xyC = x * yC yzC = y * zC zxC = z * xC rot = torch.zeros((axis_angle.shape[0], 4, 4)).to(device=axis_angle.device) rot[:, 0, 0] = torch.squeeze(x * xC + ca) rot[:, 0, 1] = torch.squeeze(xyC - zs) rot[:, 0, 2] = torch.squeeze(zxC + ys) rot[:, 1, 0] = torch.squeeze(xyC + zs) rot[:, 1, 1] = torch.squeeze(y * yC + ca) rot[:, 1, 2] = torch.squeeze(yzC - xs) rot[:, 2, 0] = torch.squeeze(zxC - ys) rot[:, 2, 1] = torch.squeeze(yzC + xs) rot[:, 2, 2] = torch.squeeze(z * zC + ca) rot[:, 3, 3] = 1 return rot def disp_to_depth( disp: Union[Tensor, np.ndarray], min_depth: Optional[float] = None, max_depth: Optional[float] = None, ) -> Union[Tensor, np.ndarray]: """Convert network's sigmoid output into depth prediction The formula for this conversion is given in the 'additional considerations' section of the monodepth2 paper. """ # if sum(x is None for x in [min_depth, max_depth]) not in [0, 2]: # raise ValueError('Either none or both of min_depth and max_depth must be None.') if min_depth is None and max_depth is None: depth = 1 / disp elif max_depth is None: depth = min_depth / disp elif min_depth is None: raise ValueError('min_depth is None') else: min_disp = 1 / max_depth max_disp = 1 / min_depth scaled_disp = min_disp + (max_disp - min_disp) * disp depth = 1 / scaled_disp return depth # ----------------------------------------------------------------------------- def h_concat_images(im1: Image, im2: Image) -> Image: dst = Image.new('RGB', (im1.width + im2.width, im1.height)) dst.paste(im1, (0, 0)) dst.paste(im2, (im1.width, 0)) return dst ================================================ FILE: loop_closure_detection/__init__.py ================================================ import loop_closure_detection.utils from loop_closure_detection.config import LoopClosureDetection as Config from loop_closure_detection.loop_closure_detection import LoopClosureDetection ================================================ FILE: loop_closure_detection/config.py ================================================ import dataclasses from pathlib import Path @dataclasses.dataclass class LoopClosureDetection: config_file: Path detection_threshold: float id_threshold: int num_matches: int ================================================ FILE: loop_closure_detection/encoder.py ================================================ import torch from torch import Tensor from torchvision import models, transforms from torchvision.models.feature_extraction import create_feature_extractor class FeatureEncoder: def __init__(self, device: torch.device): super().__init__() self.device = device # self.model = models.inception_v3(pretrained=True, transform_input=False) self.model = models.mobilenet_v3_small(pretrained=True) self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # self.model.to(self.device) # self.model.eval() # train_nodes, eval_nodes = get_graph_node_names(self.model) # pprint(eval_nodes)s self.model = create_feature_extractor(self.model, return_nodes=['flatten']) self.num_features = 576 self.model.to(self.device) self.model.eval() def __call__(self, image: Tensor) -> Tensor: image = self.normalize(image) image = image.to(self.device) with torch.no_grad(): features = self.model(image)['flatten'] return features ================================================ FILE: loop_closure_detection/loop_closure_detection.py ================================================ from pathlib import Path from typing import List, Tuple import faiss import matplotlib.pyplot as plt import numpy as np import torch from scipy.spatial.distance import cosine from torch import Tensor from loop_closure_detection.config import LoopClosureDetection as Config from loop_closure_detection.encoder import FeatureEncoder class LoopClosureDetection: def __init__( self, config: Config, ): # Initialize parameters =========================== self.threshold = config.detection_threshold self.id_threshold = config.id_threshold self.num_matches = config.num_matches # Fixed parameters ================================ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # ================================================= # Construct network =============================== self.model = FeatureEncoder(self.device) # ================================================= # Feature cache =================================== # Cosine similarity self.faiss_index = faiss.index_factory(self.model.num_features, 'Flat', faiss.METRIC_INNER_PRODUCT) self.image_id_to_index = {} self.index_to_image_id = {} # ================================================= def add(self, image_id: int, image: Tensor) -> None: # Add batch dimension if len(image.shape) == 3: image = image.unsqueeze(dim=0) features = self.model(image).squeeze().cpu().detach().numpy() features = np.expand_dims(features, 0) faiss.normalize_L2(features) # Then the inner product becomes cosine similarity self.faiss_index.add(features) self.image_id_to_index[image_id] = self.faiss_index.ntotal - 1 self.index_to_image_id[self.faiss_index.ntotal - 1] = image_id # print(f'Is Faiss index trained: {self.faiss_index.is_trained}') def search(self, image_id: int) -> Tuple[List[int], List[float]]: index_id = self.image_id_to_index[image_id] features = np.expand_dims(self.faiss_index.reconstruct(index_id), 0) distances, indices = self.faiss_index.search(features, 100) distances = distances.squeeze() indices = indices.squeeze() # Remove placeholder entries without a match distances = distances[indices != -1] indices = indices[indices != -1] # Remove self detection distances = distances[indices != index_id] indices = indices[indices != index_id] # Filter by the threshold indices = indices[distances > self.threshold] distances = distances[distances > self.threshold] # Do not return neighbors (trivial matches) distances = distances[np.abs(indices - index_id) > self.id_threshold] indices = indices[np.abs(indices - index_id) > self.id_threshold] # Return best N matches distances = distances[:self.num_matches] indices = indices[:self.num_matches] # Convert back to image IDs image_ids = sorted([self.index_to_image_id[index_id] for index_id in indices]) return image_ids, distances def predict(self, image_0: Tensor, image_1: Tensor) -> float: features_0 = self.model(image_0) features_1 = self.model(image_1) cos_sim = 1 - cosine(features_0.squeeze().cpu().detach().numpy(), features_1.squeeze().cpu().detach().numpy()) return cos_sim @staticmethod def display_matches(image_0, image_1, image_id_0, image_id_1, transformation, cosine_similarity): # Prevent circular import from slam.transform import \ string_tmat # pylint: disable=import-outside-toplevel if isinstance(image_0, Tensor): image_0 = image_0.squeeze().cpu().detach().permute(1, 2, 0) if isinstance(image_1, Tensor): image_1 = image_1.squeeze().cpu().detach().permute(1, 2, 0) filename = Path(f'./figures/sequence_00/matches/{image_id_0:04}_{image_id_1:04}.png') filename.parent.mkdir(parents=True, exist_ok=True) fig = plt.figure() plt.subplot(211) plt.imshow(image_0) plt.axis('off') plt.title(image_id_0) plt.subplot(212) plt.imshow(image_1) plt.axis('off') plt.title(image_id_1) plt.suptitle(f'cos_sim = {cosine_similarity:.4f} \n {string_tmat(transformation)}') plt.savefig(filename) plt.close(fig) ================================================ FILE: loop_closure_detection/utils.py ================================================ from typing import Optional import matplotlib.pyplot as plt def plot_image_matches( image_0, image_1, image_id_0: Optional[int] = None, image_id_1: Optional[int] = None, cosine_similarity: Optional[float] = None, save_figure: bool = True, ) -> None: fig = plt.figure() plt.subplot(211) plt.imshow(image_0) plt.axis('off') if image_id_0 is not None: plt.title(image_id_0) plt.subplot(212) plt.imshow(image_1) plt.axis('off') if image_id_1 is not None: plt.title(image_id_1) if cosine_similarity is not None: plt.suptitle(f'cos_sim = {cosine_similarity}') if save_figure: assert image_id_0 is not None and image_id_1 is not None plt.savefig(f'./figures/sequence_08/matches/{image_id_0:04}_{image_id_1:04}.png') else: plt.show() plt.close(fig) ================================================ FILE: main_adapt.py ================================================ import random import numpy as np import torch from tqdm import tqdm from config.config_parser import ConfigParser from slam import Slam from slam.utils import calc_error seed = 42 torch.manual_seed(seed) random.seed(seed) np.random.seed(seed) # ============================================================ config = ConfigParser('./config/config_adapt.yaml') print(config) # ============================================================ slam = Slam(config) with tqdm(desc='SLAM', total=len(slam)) as pbar: while slam.current_step < len(slam): losses = slam.step() pbar.set_postfix(depth=f'{losses["depth_loss"]:.5f}', velo=f'{losses["velocity_loss"]:.5f}') pbar.update(1) if slam.do_adaptation: slam.save_model() if slam.logging: slam.plot_metrics() slam.plot_trajectory() slam.pose_graph.visualize_in_meshlab(slam.log_path / 'pose_graph.obj', verbose=True) slam.gt_pose_graph.visualize_in_meshlab(slam.log_path / 'gt_pose_graph.obj', verbose=True) error_log = calc_error(slam.pose_graph.get_all_poses(), slam.gt_pose_graph.get_all_poses()) print(error_log) with open(config.depth_pose.log_path / 'log.txt', 'a', encoding='utf-8') as file: file.write(error_log) ================================================ FILE: main_pretrain.py ================================================ # Use this script to pre-train the depth / pose estimation networks from config.config_parser import ConfigParser from depth_pose_prediction import DepthPosePrediction # ============================================================ config = ConfigParser('./config/config_pretrain.yaml') print(config) # ============================================================ predictor = DepthPosePrediction(config.dataset, config.depth_pose) predictor.train(validate=True, depth_error=True, use_wandb=config.depth_pose.use_wandb) ================================================ FILE: make_cityscapes_buffer.py ================================================ from pathlib import Path from torch.utils.data import DataLoader from tqdm import tqdm from datasets import Cityscapes from slam.replay_buffer import ReplayBuffer # ============================================================ replay_buffer_path = Path( __file__).parent / 'log/cityscapes/replay_buffer' # <-- MATCH WITH config_pretrain.yaml replay_buffer_path.parent.mkdir(parents=True, exist_ok=True) replay_buffer = ReplayBuffer(replay_buffer_path, 'Cityscapes') # ============================================================ dataset = Cityscapes( Path('USER/data/cityscapes'), # <-- ADJUST THIS 'train', [-1, 0, 1], [0, 1, 2, 3], 192, 640, do_augmentation=False, views=('left', ), ) dataloader = DataLoader(dataset, num_workers=12, batch_size=1, shuffle=False, drop_last=True) # ============================================================ with tqdm(total=len(dataloader)) as pbar: for i, sample in enumerate(dataloader): replay_buffer.add(sample, dataset.get_item_filenames(i)) pbar.update(1) replay_buffer.save_state() ================================================ FILE: pyproject.toml ================================================ [tool.yapf] based_on_style = "pep8" column_limit = 100 indent_width = 4 [tool.isort] multi_line_output = 3 include_trailing_comma = true [tool.pylint.master] extension-pkg-whitelist = "cv2" jobs = 0 [tool.pylint.design] min-public-methods = 1 [tool.pylint.refactoring] max-nested-blocks = 6 [tool.pylint.format] max-line-length = 100 ignore-long-lines = "^\\s*(# )??$" max-module-lines=1500 [tool.pylint.typecheck] generated-members = [ "numpy.*", "torch.*", "g2o.*", ] [tool.pylint.string] check-quote-consistency = "yes" [tool.pylint.messages_control] disable = [ "missing-module-docstring", "missing-class-docstring", "missing-function-docstring", "fixme", "too-many-instance-attributes", "too-many-locals", "too-many-arguments", "too-many-statements", "too-many-branches", "unused-variable", "invalid-name", "duplicate-code", ] ================================================ FILE: requirements.txt ================================================ pre-commit yapf pylint torch==1.10.0 torchvision==0.11.1 setuptools==58.2.0 numpy Pillow tqdm matplotlib pyyaml wandb opencv-python colour_demosaicing scipy sklearn scikit-image pandas faiss-gpu ================================================ FILE: slam/__init__.py ================================================ import slam.utils from slam.config import ReplayBuffer as ReplayBufferConfig from slam.config import Slam as Config from slam.meshlab import MeshlabInf from slam.pose_graph_optimization import PoseGraphOptimization from slam.slam import Slam ================================================ FILE: slam/config.py ================================================ import dataclasses from pathlib import Path @dataclasses.dataclass class Slam: config_file: Path dataset_sequence: int adaptation: bool adaptation_epochs: int min_distance: float start_frame: int logging: bool do_loop_closures: bool keyframe_frequency: int lc_distance_poses: int @dataclasses.dataclass class ReplayBuffer: config_file: Path maximize_diversity: bool max_buffer_size: int similarity_threshold: float similarity_sampling: bool load_path: Path ================================================ FILE: slam/meshlab.py ================================================ # pylint: skip-file from subprocess import call from typing import List, Tuple, Union import cv2 import matplotlib.cm import numpy as np from tqdm import trange DEFAULT_COLOR = (0, 0, 0) class MeshlabInf: @staticmethod def show_multi_layer(**kwargs): """ usage: given a set of MeshlabInf objects and their names, displays them in layer format. example: o1 = MeshlabInf() o2 = MeshlabInf() ... MeshlabInf().show_multi_layer(layerA=o1,layerB=o2,...) """ d = dict(**kwargs) for k, v in d.items(): v.write(k + ".obj") call(["meshlab"] + [x + ".obj" for x in d.keys()]) @staticmethod def plot3d(pts, false_color=False): oo = MeshlabInf() oo.add_points(pts) oo.show(false_color) @staticmethod def get_colormap(sz, cmap_name="jet"): colormap = matplotlib.cm.get_cmap(cmap_name)(np.linspace(0, 1, sz))[:, 0:3] return colormap def __init__(self, global_transformation=np.eye(4)): self.global_transformation = global_transformation self._xyzrgb = np.empty((0, 6)) self._faces = [] self._lines = [] def clear(self): self.__init__() def add_camera(self, p, color=DEFAULT_COLOR, scale=0.1, rotation=np.eye(3), camera_matrix=np.eye(3)): q = 1 / camera_matrix[0, 0] b = 1 / camera_matrix[1, 1] u = camera_matrix[-1, 0] / q v = camera_matrix[-1, 1] / b pts = ( np.array( [+u, +v, +0, -q, -b, +1, -q, +b, +1, +q, +b, +1, +q, -b, +1, +q, +v, +0, +u, +b, +0] ).reshape(-1, 3) * scale ) zup_from_zfwd = np.array([[0.0, -1.0, -0.0], [0.0, 0.0, -1.0], [1.0, 0.0, 0.0]]) pts = pts @ zup_from_zfwd pts = pts @ rotation.T pts = pts + np.array(p).flatten() n = self._xyzrgb.shape[0] self.add_points(pts, color) f = np.array([0, 1, 2, 0, 2, 3, 0, 3, 4, 0, 4, 1, 0, 5, 6]).reshape(-1, 3) self.add_faces(list(f + n)) def add_line(self, p1, p2, c=None): self.add_points(p1, c) self.add_points(p2, c) self.add_points(p2, c) n = self._xyzrgb.shape[0] self._lines.append([n - 3, n - 2, n - 1]) def add_mesh(self, xyz, c=None): m, n, _ = xyz.shape if c is None: c = np.ones((m, n, 3)) elif len(c.shape) == 2: c = np.repeat(np.expand_dims(c, 2), 3, axis=2) elif len(c.shape) == 3: pass else: raise Exception("unknown color size") good_ind = np.arange(0, m * n).reshape(m, n) va = np.vstack((good_ind[:-1:, :-1:].flatten(), good_ind[:-1:, 1::].flatten(), good_ind[1::, :-1:].flatten())).T vb = np.vstack( (good_ind[1::, 1::].flatten(), good_ind[1::, :-1:].flatten(), good_ind[:-1:, 1::].flatten())).T v = np.vstack((va, vb)) v = v[:, [1, 0, 2]] xyzrgb = np.hstack((xyz.reshape(m * n, 3), c.reshape(m * n, 3))) ok = np.all(~np.isnan(xyzrgb), axis=1) good_ind = np.arange(0, xyzrgb.shape[0])[ok] bad_ind = np.arange(0, xyzrgb.shape[0])[~ok] xyzrgb = xyzrgb[good_ind, :] for i in bad_ind: v = v[np.all(v != i, axis=1), :] for i, idx in enumerate(good_ind): v[v == idx] = i n = self._xyzrgb.shape[0] self.add_points(xyzrgb) self.add_faces(list(v + n)) def add_points(self, xyz, color=None): xyz_ = xyz.copy() if len(xyz_.shape) == 1: xyz_ = xyz_.reshape(1, -1) xyz_ = xyz_[~np.any(np.isnan(xyz_), axis=1), :] n = xyz_.shape[0] if color is None: if xyz_.shape[1] == 3: xyz_ = np.hstack((xyz_, np.ones((n, 3)) * xyz_[:, 2:])) elif xyz_.shape[1] == 4: xyz_ = np.hstack((xyz_, np.ones((n, 2)) * xyz_[:, 3:])) elif xyz_.shape[1] == 6: pass else: raise Exception("unknown points dimension") elif isinstance(color, np.ndarray) and color.shape[0] == xyz.shape[0] and color.shape[1] == 3: xyz_ = np.c_[xyz_[:, :3], color] else: if len(color) != 3: raise Exception("color should be 3 elem vector") xyz_ = np.c_[xyz_[:, :3], np.ones((n, 1)) * color] self._xyzrgb = np.vstack((self._xyzrgb, xyz_)) def add_pgon(self, xyz, color=None): xyz = xyz[~np.any(np.isnan(xyz), axis=1), :] n = self._xyzrgb.shape[0] self.add_points(xyz, color) self.add_faces([list(np.arange(n, n + xyz.shape[0]))]) def add_faces(self, verts): self._faces += verts def read(self, fname): raise NotImplementedError("function 'read' is not yet implemented") def show(self, false_color=False): fn = "temp.obj" self.write(fn, false_color) call(["meshlab", fn]) # call(['rm', fn]) def write(self, fname, false_color=False, verbose=True): xyzrgb = self._xyzrgb.copy() if false_color: colind = np.mean(self._xyzrgb[:, 3:], axis=1) mm = norm_range_01(colind, (1, 99)) colind = (mm * 255).astype(int) colormap = self.get_colormap(256) for i in range(xyzrgb.shape[0]): z = xyzrgb[i, 2] if np.isnan(z): xyzrgb[i, 3:] = np.array([0, 0, 0]) else: xyzrgb[i, 3:] = colormap[colind[i], :] colormap2 = self.get_colormap(len(self._faces)) for j, p in enumerate(self._faces): for i in p: xyzrgb[i, 3:] = colormap2[j, :] else: xyzrgb[:, 3:] = norm_range_01(xyzrgb[:, 3:]) xyzrgb[:, 3:] += 1 - np.max(xyzrgb[:, 3:]) with open(fname, "w", encoding='utf-8') as f: f.write("# OBJ file\n") for i in trange(xyzrgb.shape[0], desc="Writing points to meshlab file", disable=not verbose): x = xyzrgb[i, :] x[:3] = self.global_transformation[:3, :3] @ x[:3] + self.global_transformation[:3, -1] f.write("v %.4f %.4f %.4f %.4f %.4f %.4f\n" % (x[0], x[1], x[2], x[3], x[4], x[5])) for p in self._faces: f.write("f") for i in p: f.write(" %d" % (i + 1)) f.write("\n") for p in self._lines: f.write("f") for i in p: f.write(" %d" % (i + 1)) f.write("\n") f.close() def norm_range_01(v: np.ndarray, prcnt: tuple = None) -> np.ndarray: """ normalize the input values in range of [0-1]. """ if np.all(np.isnan(v)): return v if prcnt is None: min_value = np.nanmin(v) max_value = np.nanmax(v) else: if len(prcnt) != 2 or prcnt[0] >= prcnt[1] or prcnt[0] < 0 or prcnt[1] > 100: raise Exception("input #2 should contain hi and low percentage within [0-100]") min_value, max_value = np.nanpercentile(v, prcnt) d = max_value - min_value if d < np.finfo(type(min_value)).eps: d = 1 v_out = (v - min_value) / d v_out = np.clip(v_out, 0, 1) return v_out def rotation_matrix_from_to(v_from: Union[List, Tuple, np.ndarray], v_to: Union[List, Tuple, np.ndarray], output4x4: bool = False) -> np.ndarray: """ calculate the rotation matrix the describes the rotation from input vector v_from to input vector v_to. input vectors must be of length 3 """ assert len(v_from) == 3 assert len(v_to) == 3 v_to = (v_to / np.linalg.norm(v_to)).flatten() v_from = (v_from / np.linalg.norm(v_from)).flatten() if np.all(np.abs(v_to - v_from) < np.finfo(float).eps * 1e3): axis = v_to angle = 0 else: axis = np.cross(v_from, v_to) nrm = np.linalg.norm(axis) if nrm == 0: # co-linear rr = np.random.randn(3) axis = rr - (v_from.T @ rr) * v_from axis = axis / np.linalg.norm(axis) angle = np.pi else: axis /= nrm angle = np.arccos(min(1.0, v_to @ v_from)) r, _ = cv2.Rodrigues(axis * angle) if output4x4: outmat = np.eye(4) outmat[:3, :3] = r else: outmat = r return outmat ================================================ FILE: slam/pose_graph_optimization.py ================================================ import g2o import numpy as np from slam.meshlab import MeshlabInf class PoseGraphOptimization(g2o.SparseOptimizer): def __init__(self): self.edge_vertices = set() self.num_loop_closures = 0 super().__init__() solver = g2o.BlockSolverSE3(g2o.LinearSolverCholmodSE3()) solver = g2o.OptimizationAlgorithmLevenberg(solver) super().set_algorithm(solver) # See https://github.com/RainerKuemmerle/g2o/issues/34 self.se3_offset_id = 0 se3_offset = g2o.ParameterSE3Offset() se3_offset.set_id(self.se3_offset_id) super().add_parameter(se3_offset) def __str__(self): string = f'Vertices: {len(self.vertex_ids)}\n' string += f'Edges: {len(self.edge_vertices)}\n' string += f'Loops: {self.num_loop_closures}' return string @property def vertex_ids(self): return sorted(list(self.vertices().keys())) def optimize(self, max_iterations=1000, verbose=False): super().initialize_optimization() super().set_verbose(verbose) super().optimize(max_iterations) def add_vertex(self, vertex_id, pose, fixed=False): v_se3 = g2o.VertexSE3() v_se3.set_id(vertex_id) v_se3.set_estimate(g2o.Isometry3d(pose)) v_se3.set_fixed(fixed) super().add_vertex(v_se3) def add_vertex_point(self, vertex_id, point, fixed=False): v_point = g2o.VertexPointXYZ() v_point.set_id(vertex_id) v_point.set_estimate(point) v_point.set_fixed(fixed) super().add_vertex(v_point) def add_edge(self, vertices, measurement, information=np.eye(6), robust_kernel=None, is_loop_closure=False): self.edge_vertices.add(vertices) if is_loop_closure: self.num_loop_closures += 1 edge = g2o.EdgeSE3() for i, v in enumerate(vertices): if isinstance(v, int): v = self.vertex(v) edge.set_vertex(i, v) edge.set_measurement(g2o.Isometry3d(measurement)) # relative pose edge.set_information(information) # robust_kernel = g2o.RobustKernelHuber() if robust_kernel is not None: edge.set_robust_kernel(robust_kernel) super().add_edge(edge) def add_edge_pose_point(self, vertex_pose, vertex_point, measurement, information=np.eye(3), robust_kernel=None): edge = g2o.EdgeSE3PointXYZ() edge.set_vertex(0, self.vertex(vertex_pose)) edge.set_vertex(1, self.vertex(vertex_point)) edge.set_measurement(measurement) edge.set_information(information) if robust_kernel is not None: edge.set_robust_kernel(robust_kernel) edge.set_parameter_id(0, self.se3_offset_id) super().add_edge(edge) def get_pose(self, vertex_id): return self.vertex(vertex_id).estimate().matrix() def get_all_poses(self): return [self.get_pose(i) for i in self.vertex_ids] def get_transform(self, vertex_id_src, vertex_id_dst): pose_src = self.get_pose(vertex_id_src) pose_dst = self.get_pose(vertex_id_dst) transform = np.linalg.inv(pose_src) @ pose_dst return transform def does_edge_exists(self, vertex_id_a, vertex_id_b): return (vertex_id_a, vertex_id_b) in self.edge_vertices or (vertex_id_b, vertex_id_a) in self.edge_vertices def is_vertex_in_any_edge(self, vertex_id): vertices = set() for edge in self.edge_vertices: vertices.add(edge[0]) vertices.add(edge[1]) return vertex_id in vertices def does_vertex_have_only_global_edges(self, vertex_id): assert self.is_vertex_in_any_edge(vertex_id) for edge in self.edge_vertices: if vertex_id not in edge: continue if np.abs(edge[0] - edge[1]) == 1: return False return True def visualize_in_meshlab(self, filename, meshlab=None, verbose=True): if len(self.vertex_ids) > 0: points = {} for vertex_id, vertex in self.vertices().items(): if isinstance(vertex, g2o.VertexSE3): points[vertex_id] = vertex.estimate().matrix()[:3, 3] if meshlab is None: meshlab = MeshlabInf() for point in points.values(): meshlab.add_points(point) for edge in self.edge_vertices: meshlab.add_line(points[edge[0]], points[edge[1]]) # for vertex_id, vertex in self.vertices().items(): # if isinstance(vertex, g2o.VertexSE3): # p = vertex.estimate().matrix()[:3, 3] # r = vertex.estimate().matrix()[:3, :3] # meshlab.add_camera(p, rotation=r) meshlab.write(filename, verbose=verbose) ================================================ FILE: slam/replay_buffer.py ================================================ import os import pickle import shutil from pathlib import Path from typing import Any, Dict, List, Optional import faiss import numpy as np import torch from PIL import Image from torch import Tensor from torch.utils.data import Dataset as TorchDataset from torchvision import transforms from datasets.utils import get_random_color_jitter from loop_closure_detection.encoder import FeatureEncoder class ReplayBuffer(TorchDataset): def __init__( self, storage_dir: Path, dataset_type: str, state_path: Optional[Path] = None, height: int = 0, width: int = 0, scales: List[int] = None, frames: List[int] = None, num_workers: int = 1, do_augmentation: bool = False, batch_size: int = 1, maximize_diversity: bool = False, max_buffer_size: int = np.iinfo(int).max, similarity_threshold: float = 1, similarity_sampling: bool = True, ): self.storage_dir = storage_dir # self._reset_storage_dir() self.dataset_type = dataset_type.lower() self.num_workers = num_workers self.do_augmentation = do_augmentation self.batch_size = batch_size # Restrict size of the replay buffer self.NUMBER_SAMPLES_PER_ENVIRONMENT = 100 self.valid_indices = {} self.buffer_filenames = {} self.online_filenames = [] # Precompute the resize functions for each scale relative to the previous scale # If scales is None, the size of the raw data will be used self.scales = scales self.frames = frames self.resize = {} if self.scales is not None: for s in self.scales: exp_scale = 2 ** s self.resize[s] = transforms.Resize( (height // exp_scale, width // exp_scale), interpolation=transforms.InterpolationMode.LANCZOS) # Ensure repeatability of experiments self.target_sampler = np.random.default_rng(seed=42) # Dissimilarity-based buffer self.similarity_sampling = similarity_sampling self.maximize_diversity = maximize_diversity self.buffer_size = max_buffer_size self.similarity_threshold = similarity_threshold self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.feature_encoder = FeatureEncoder(self.device) self.faiss_index = None self.faiss_index_offset = 0 self.distance_matrix = None self.distance_matrix_indices = None if state_path is not None: self.load_state(state_path) def add(self, sample: Dict[str, Any], sample_filenames: Dict[str, Any], image_features: Optional[Tensor] = None, verbose: bool = False): # pylint: disable=no-value-for-parameter index = sample['index'].item() assert index == sample_filenames['index'] index += self.faiss_index_offset if self.faiss_index is None: if image_features is None: num_features = self.feature_encoder.num_features else: num_features = image_features.shape[1] self.faiss_index = faiss.IndexIDMap( faiss.index_factory(num_features, 'Flat', faiss.METRIC_INNER_PRODUCT)) if image_features is None: image_features = self.feature_encoder(sample['rgb', 0, 0]).detach().cpu().numpy() faiss.normalize_L2(image_features) # The inner product becomes cosine similarity add_sample = False remove_sample = None if self.maximize_diversity: # Only add if sufficiently dissimilar to the existing samples if self.faiss_index.ntotal == 0: similarity = 0 else: similarity = self.faiss_index.search(image_features, 1)[0][0][0] if similarity < self.similarity_threshold: self.faiss_index.add_with_ids(image_features, np.array([index])) add_sample = True if verbose: print(f'Added sample {index} to the replay buffer | similarity {similarity}') if self.faiss_index.ntotal > self.buffer_size: # Maximize the diversity in the replay buffer if self.distance_matrix is None: features = self.faiss_index.index.reconstruct_n(0, self.faiss_index.ntotal) dist_mat, matching = self.faiss_index.search(features, self.faiss_index.ntotal) for i in range(self.faiss_index.ntotal): dist_mat[i, :] = dist_mat[i, matching[i].argsort()] self.distance_matrix = dist_mat self.distance_matrix_indices = faiss.vector_to_array( self.faiss_index.id_map) else: # Only update the elements that actually change fill_up_index = np.argwhere(self.distance_matrix_indices < 0)[0, 0] a, b = self.faiss_index.search(image_features, self.faiss_index.ntotal) self.distance_matrix_indices[fill_up_index] = index sorter = np.argsort(b[0]) sorter_idx = sorter[ np.searchsorted(b[0], self.distance_matrix_indices, sorter=sorter)] a = a[:, sorter_idx][0] self.distance_matrix[fill_up_index, :] = self.distance_matrix[:, fill_up_index] = a # Subtract self-similarity remove_index_tmp = np.argmax( self.distance_matrix.sum(0) - self.distance_matrix.diagonal()) self.distance_matrix[:, remove_index_tmp] = self.distance_matrix[ remove_index_tmp, :] = -1 remove_index = self.distance_matrix_indices[remove_index_tmp] self.distance_matrix_indices[remove_index_tmp] = -1 self.faiss_index.remove_ids(np.array([remove_index])) remove_sample = remove_index if verbose: print(f'Removed sample {remove_index} from the replay buffer') else: self.faiss_index.add_with_ids(image_features, np.array([index])) add_sample = True if self.faiss_index.ntotal > self.buffer_size: remove_index = self.target_sampler.choice(self.faiss_index.ntotal, 1)[0] remove_sample = faiss.vector_to_array(self.faiss_index.id_map)[remove_index] self.faiss_index.remove_ids(np.array([remove_sample])) # if verbose: # print(f'Removed sample {remove_sample} from the target buffer') if add_sample: filename = self.storage_dir / f'{self.dataset_type}_{index:>05}.pkl' data = { key: value for key, value in sample.items() if 'index' in key or 'camera_matrix' in key or 'inv_camera_matrix' in key or 'relative_distance' in key } data['rgb', -1] = sample_filenames['images'][0] data['rgb', 0] = sample_filenames['images'][1] data['rgb', 1] = sample_filenames['images'][2] with open(filename, 'wb') as f: pickle.dump(data, f, pickle.HIGHEST_PROTOCOL) self.online_filenames.append(filename) if remove_sample is not None: for filename in self.online_filenames: if f'_{remove_sample:>05}.pkl' in filename.name: os.remove(filename) self.online_filenames.remove(filename) break def get(self, sample: Dict[str, Any], image_features: Optional[Tensor] = None) -> Dict[ str, Any]: return_data = {} # Sample from target buffer if self.online_filenames and self.batch_size > 0: index = sample['index'].item() + self.faiss_index_offset filename = self.storage_dir / f'{self.dataset_type}_{index:>05}.pkl' # The current sample is the only one that is in the buffer if len(self.online_filenames) == 1 and filename in self.online_filenames: replace = True num_samples = 1 sampling_prob = None else: # Do not sample the current sample if filename in self.online_filenames: num_samples = len(self.online_filenames) - 1 # -1 for the current sample else: num_samples = len(self.online_filenames) replace = self.batch_size > num_samples if self.similarity_sampling: assert self.faiss_index.ntotal > 0 if image_features is None: image_features = self.feature_encoder( sample['rgb', 0, 0]).detach().cpu().numpy() faiss.normalize_L2( image_features) # The inner product becomes cosine similarity similarity, indices = self.faiss_index.search(image_features, self.faiss_index.ntotal) if index in indices: similarity = np.delete(similarity, np.argwhere(indices == index)) else: similarity = similarity[0] dissimilarity = 1 - similarity # sampling_prob = dissimilarity / dissimilarity.sum() sampling_prob = similarity / similarity.sum() else: sampling_prob = None indices = self.target_sampler.choice(num_samples, self.batch_size, replace, sampling_prob) filenames = [self.online_filenames[index] for index in indices] return_data = self._get(filenames[0]) for filename in filenames[1:]: data = self._get(filename) for key in return_data: return_data[key] = torch.cat([return_data[key], data[key]]) return return_data def save_state(self): filename = self.storage_dir / 'buffer_state.pkl' data = {'filenames': self.online_filenames, 'faiss_index': self.faiss_index} with open(filename, 'wb') as f: pickle.dump(data, f) print(f'Saved reply buffer state to: {filename}') for key, value in self.buffer_filenames.items(): print(f'{key + ":":<12} {len(value):>5}') def load_state(self, state_path: Path): with open(state_path, 'rb') as f: data = pickle.load(f) # self.buffer_filenames = data['filenames'] self.faiss_index = data['faiss_index'] self.faiss_index_offset = faiss.vector_to_array(self.faiss_index.id_map).max() self.online_filenames = [state_path.parent / file.name for file in data['filenames']] print(f'Load replay buffer state from: {state_path}') for key, value in self.buffer_filenames.items(): print(f'{key + ":":<12} {len(value):>5}') def __getitem__(self, index: int) -> Dict[Any, Tensor]: raise NotImplementedError def __len__(self): return 1000000 # Fixed number as the sampling is handled in the get() function def _get(self, filename, include_batch=True): if self.do_augmentation: color_augmentation = get_random_color_jitter((0.8, 1.2), (0.8, 1.2), (0.8, 1.2), (-.1, .1)) with open(filename, 'rb') as f: data = pickle.load(f) for frame in self.frames: rgb = Image.open(data['rgb', frame]).convert('RGB') rgb = self.resize[0](rgb) data['rgb', frame, 0] = rgb for scale in self.scales: if scale == 0: continue data['rgb', frame, scale] = self.resize[scale](data['rgb', frame, scale - 1]) for scale in self.scales: data['rgb', frame, scale] = transforms.ToTensor()(data['rgb', frame, scale]) if include_batch: data['rgb', frame, scale] = data['rgb', frame, scale].unsqueeze(0) if self.do_augmentation: data['rgb_aug', frame, scale] = color_augmentation(data['rgb', frame, scale]) else: data['rgb_aug', frame, scale] = data['rgb', frame, scale] del data['rgb', frame] # Removes the filename string if not include_batch: for key in data: if not ('rgb' in key or 'rgb_aug' in key): data[key] = data[key].squeeze(0) return data def _reset_storage_dir(self): if self.storage_dir.exists(): shutil.rmtree(self.storage_dir) self.storage_dir.mkdir(parents=True, exist_ok=True) ================================================ FILE: slam/slam.py ================================================ import pickle import matplotlib.pyplot as plt import numpy as np import torch from torch.utils.data import DataLoader from datasets import Kitti, Robotcar from depth_pose_prediction import DepthPosePrediction from loop_closure_detection import LoopClosureDetection from slam.pose_graph_optimization import PoseGraphOptimization from slam.replay_buffer import ReplayBuffer from slam.utils import calc_depth_error, rotation_error, translation_error PLOTTING = True class Slam: def __init__(self, config): # Config ========================================= self.config = config self.online_dataset_type = config.dataset.dataset self.online_dataset_path = config.dataset.dataset_path self.do_adaptation = config.slam.adaptation self.adaptation_epochs = config.slam.adaptation_epochs self.min_distance = config.slam.min_distance self.start_frame = config.slam.start_frame self.logging = config.slam.logging if not self.do_adaptation: config.depth_pose.batch_size = 1 self.log_path = config.depth_pose.log_path self.log_path.mkdir(parents=True, exist_ok=True) self.do_loop_closures = config.slam.do_loop_closures self.keyframe_frequency = config.slam.keyframe_frequency self.lc_distance_poses = config.slam.lc_distance_poses # Depth / pose predictor ========================== self.predictor = DepthPosePrediction(config.dataset, config.depth_pose, use_online=False) self.predictor.load_model(load_optimizer=False) # Dataloader ====================================== if self.online_dataset_type == 'Kitti': self.online_dataset = Kitti( self.online_dataset_path, config.slam.dataset_sequence, config.dataset.frame_ids, config.dataset.scales, config.dataset.height, config.dataset.width, poses=True, # Ground truth poses with_depth=False, min_distance=config.slam.min_distance, ) elif self.online_dataset_type == 'RobotCar': if config.slam.dataset_sequence == 1: start_frame, end_frame = 750, 4750 else: start_frame, end_frame = 22100, 26100 self.online_dataset = Robotcar( self.online_dataset_path, '2015-08-12-15-04-18', config.dataset.frame_ids, config.dataset.scales, config.dataset.height, config.dataset.width, poses=True, # Ground truth poses min_distance=config.slam.min_distance, start_frame=start_frame, end_frame=end_frame, every_n_frame=2, ) else: raise ValueError('Unsupported dataset type.') self.online_dataloader = DataLoader( self.online_dataset, batch_size=1, # Simulates online loading shuffle=False, # Simulates online loading num_workers=config.depth_pose.num_workers, pin_memory=True, drop_last=True) self.online_dataloader_iter = iter(self.online_dataloader) if self.do_adaptation and config.depth_pose.batch_size > 1: # and False: replay_buffer_path = config.replay_buffer.load_path replay_buffer_path.mkdir(parents=True, exist_ok=True) replay_buffer_state_path = replay_buffer_path / 'buffer_state.pkl' replay_buffer_state_path = replay_buffer_state_path if \ replay_buffer_state_path.exists() else None self.replay_buffer = ReplayBuffer( replay_buffer_path, self.online_dataset_type, replay_buffer_state_path, self.online_dataset.height, self.online_dataset.width, self.online_dataset.scales, self.online_dataset.frame_ids, do_augmentation=True, batch_size=config.depth_pose.batch_size - 1, maximize_diversity=config.replay_buffer.maximize_diversity, max_buffer_size=config.replay_buffer.max_buffer_size, similarity_threshold=config.replay_buffer.similarity_threshold, similarity_sampling=config.replay_buffer.similarity_sampling, ) else: self.replay_buffer = None # Pose graph backend ============================== self.loop_closure_detection = LoopClosureDetection(config.loop_closure) self.pose_graph = PoseGraphOptimization() if self.start_frame == 0: self.pose_graph.add_vertex(0, self.online_dataset.global_poses[1], fixed=True) # self.pose_graph.add_vertex(0, np.eye(4), fixed=True) self.gt_pose_graph = PoseGraphOptimization() # Used for visualization self.gt_pose_graph.add_vertex(0, self.online_dataset.global_poses[1], fixed=True) # Auxiliary variables ============================= self.current_step = 0 self.since_last_loop_closures = self.lc_distance_poses # Logging ========================================= # Track the relative error per step self.rel_trans_error = [] self.rel_rot_error = [] # Track the losses of the online data self.depth_loss = [] self.velocity_loss = [] # Track the depth error per step self.depth_error = [] self.depth_loss_threshold = -1 # .04 self.velo_loss_threshold = -1 def __len__(self): return len(self.online_dataset) def step(self): self.current_step += 1 # Combine online and replay data ================== online_data = next(self.online_dataloader_iter) self.predictor._set_eval() with torch.no_grad(): online_image = online_data['rgb', 0, 0].to(self.predictor.device) online_features = self.predictor.models['depth_encoder'](online_image)[4].detach() online_features = online_features.mean(-1).mean(-1).cpu().numpy() if self.replay_buffer is not None: self.replay_buffer.add(online_data, self.online_dataset.get_item_filenames(self.current_step - 1), online_features, verbose=True) if self.replay_buffer is not None: replay_data = self.replay_buffer.get(online_data, online_features) if replay_data: training_data = self._cat_dict(online_data, replay_data) else: training_data = online_data else: training_data = online_data # ================================================= # Use the measured velocity for this check if self.current_step > 1 and online_data['relative_distance', 1] < self.min_distance: print(f'skip: {online_data["relative_distance", 1].detach().cpu().numpy()[0]}') return {'depth_loss': 0, 'velocity_loss': 0} # Depth / pose prediction ========================= # The returned losses are wrt the online data if self.do_adaptation: # Update the network weights outputs, losses = self.predictor.adapt(online_data, training_data, steps=self.adaptation_epochs) else: outputs, losses = self.predictor.adapt(online_data, None) # Extract input/output for online data image = online_data['rgb', 1, 0] if torch.sign(online_data['relative_distance', 1]) < 0: transformation = outputs['cam_T_cam', 0, 1][0, :] else: transformation = torch.linalg.inv(outputs['cam_T_cam', 0, 1][0, :]) # Move to CPU for further processing transformation = transformation.squeeze().cpu().detach().numpy() for k, v in losses.items(): losses[k] = float(v.squeeze().cpu().detach().numpy()) if 'velocity_loss' not in losses: losses['velocity_loss'] = 0 if 'depth_loss' not in losses: losses['depth_loss'] = 0 # ================================================= # Ground truth poses ============================== gt_transformation = online_data['relative_pose', 1].squeeze().cpu().detach().numpy() gt_pose = online_data['absolute_pose', 1].squeeze().cpu().detach().numpy() self.gt_pose_graph.add_vertex(self.current_step, gt_pose) self.gt_pose_graph.add_edge((self.gt_pose_graph.vertex_ids[-2], self.current_step), gt_transformation) # ================================================= # Pose graph ====================================== # Mapping can start later to account for initial warming up to the dataset if self.current_step == self.start_frame: self.pose_graph.add_vertex(self.current_step, gt_pose, fixed=True) print(f'Start mapping at frame {self.current_step}') elif self.current_step > self.start_frame: # Initialize with predicted odometry odom_pose = self.pose_graph.get_pose(self.pose_graph.vertex_ids[-1]) @ transformation self.pose_graph.add_vertex(self.current_step, odom_pose) cov_matrix = np.eye(6) cov_matrix[2, 2] = .1 cov_matrix[5, 5] = .1 self.pose_graph.add_edge((self.pose_graph.vertex_ids[-2], self.current_step), transformation, information=np.linalg.inv(cov_matrix)) # ================================================= # Loop closure detection ========================== optimized = False if self.do_loop_closures and self.current_step >= self.start_frame: self.loop_closure_detection.add(self.current_step, image.squeeze()) if not self.current_step % self.keyframe_frequency and self.current_step < 4000: if self.since_last_loop_closures > self.lc_distance_poses: lc_step_ids, distances = self.loop_closure_detection.search(self.current_step) for i, d in zip(lc_step_ids, distances): lc_image = self.online_dataset[i - 1]['rgb', 1, 0] lc_transformation, cov_matrix = self.predictor.predict_pose(image, lc_image, as_numpy=True) graph_transformation = self.pose_graph.get_transform(self.current_step, i) print(f'{self.current_step} --> {i} ' f'[sim={d:.3f}, pred_dist={np.linalg.norm(lc_transformation):.1f}m, ' f'graph_dist={np.linalg.norm(graph_transformation):.1f}m]') # LoopClosureDetection.display_matches(image, lc_image, self.current_step, # i, lc_transformation, d) cov_matrix = np.eye(6) cov_matrix[2, 2] = .1 cov_matrix[5, 5] = .1 self.pose_graph.add_edge((self.current_step, i), lc_transformation, information=.5 * np.linalg.inv(cov_matrix), is_loop_closure=True) if len(lc_step_ids) > 0: self.pose_graph.optimize(max_iterations=10000, verbose=False) optimized = True if optimized: self.since_last_loop_closures = 0 else: self.since_last_loop_closures += 1 # ================================================= # Track metrics =================================== if self.logging: # Relative error of prediction rel_err = np.linalg.inv(gt_transformation) @ transformation self.rel_trans_error.append(translation_error(rel_err)) self.rel_rot_error.append(rotation_error(rel_err)) # Loss self.depth_loss.append(losses['depth_loss']) self.velocity_loss.append(losses['velocity_loss']) # Depth error if self.online_dataset_type == 'Kitti': self.depth_error.append( calc_depth_error(outputs['depth', 0][0, ...].squeeze().detach().cpu().numpy(), online_data['depth', 0, -1][0, ...].squeeze().detach().cpu().numpy(), min_depth=self.predictor.min_depth, max_depth=self.predictor.max_depth)) # Plot the tracked metrics if PLOTTING and (not self.current_step % 100 or optimized): self.plot_metrics() self.plot_trajectory() self.pose_graph.visualize_in_meshlab(self.log_path / 'pose_graph.obj', verbose=False) self.gt_pose_graph.visualize_in_meshlab(self.log_path / 'gt_pose_graph.obj', verbose=False) # ================================================= return losses def save_metrics(self) -> None: data = { 'rel_trans_error': self.rel_trans_error, 'rel_rot_error': self.rel_rot_error, 'depth_loss': self.depth_loss, 'velocity_loss': self.velocity_loss, 'depth_error': self.depth_error, } filename = self.log_path / 'metrics.pkl' with open(filename, 'wb') as f: pickle.dump(data, f) def save_model(self) -> None: self.predictor.save_model() if self.replay_buffer is not None: self.replay_buffer.save_state() @staticmethod def _cat_dict(dict_1, dict_2): """ Concatenate the elements of online input dictionary with the corresponding elements of the replay buffer dictionary """ res_dict = {} for key in dict_1: if key in dict_2: res_dict[key] = torch.cat([dict_1[key], dict_2[key]]) return res_dict @staticmethod def _pose_graph_to_2d_trajectory(pose_graph): # Returns the trajectory in X-Z dimension poses = pose_graph.get_all_poses() trajectory = np.asarray([[p[0, 3], p[2, 3]] for p in poses]) return trajectory def plot_trajectory(self): pred_trajectory = self._pose_graph_to_2d_trajectory(self.pose_graph) gt_trajectory = self._pose_graph_to_2d_trajectory(self.gt_pose_graph) fig = plt.figure() plt.plot(pred_trajectory[:, 0], pred_trajectory[:, 1], '--.', label='pred') plt.plot(gt_trajectory[:, 0], gt_trajectory[:, 1], '--.', label='gt') plt.axis('equal') plt.legend() plt.title(f'Step = {self.current_step}') # filename = self.log_path / 'trajectory' / f'step_{self.current_step:04}.png' # filename.parent.mkdir(parents=True, exist_ok=True) # plt.savefig(str(filename)) filename = self.log_path / 'trajectory.png' plt.savefig(str(filename)) plt.close(fig) np.save(self.log_path / 'trajectory.npy', pred_trajectory) np.save(self.log_path / 'gt_trajectory.npy', gt_trajectory) def plot_metrics(self, filename: str = 'metrics.png'): if self.depth_error: fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(12, 6)) else: # fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(16, 12)) fig, axs = plt.subplots(nrows=2, ncols=2) # Losses axs[0, 0].plot(self.depth_loss) axs[0, 0].axhline(self.depth_loss_threshold, color='r') axs[0, 0].set_ylim(bottom=0, top=1.1 * max(self.depth_loss)) axs[0, 0].set_xlabel('Step') axs[0, 0].set_ylabel('Depth loss') axs[0, 0].set_title('Depth loss') axs[1, 0].plot(self.velocity_loss) axs[1, 0].axhline(self.velo_loss_threshold, color='r') axs[1, 0].set_ylim(bottom=0, top=1.1 * max(self.velocity_loss)) axs[1, 0].set_xlabel('Step') axs[1, 0].set_ylabel('Velocity loss') axs[1, 0].set_title('Velocity loss') # Relative errors axs[0, 1].plot(self.rel_trans_error) axs[0, 1].set_ylim(bottom=0) axs[0, 1].set_xlabel('Step') axs[0, 1].set_ylabel('Relative trans. error') axs[0, 1].set_title('Relative trans. error') axs[1, 1].plot(self.rel_rot_error) axs[1, 1].set_ylim(bottom=0) axs[1, 1].set_xlabel('Step') axs[1, 1].set_ylabel('Relative rot. error') axs[1, 1].set_title('Relative rot. error') # Depth error if self.depth_error: axs[0, 2].plot([x['abs_rel'] for x in self.depth_error]) axs[0, 2].set_ylim(bottom=0) axs[0, 2].set_xlabel('Step') axs[0, 2].set_ylabel('Abs rel') axs[0, 2].set_title('Abs rel / ARD') axs[1, 2].plot([x['sq_rel'] for x in self.depth_error]) axs[1, 2].set_ylim(bottom=0) axs[1, 2].set_xlabel('Step') axs[1, 2].set_ylabel('Sq rel') axs[1, 2].set_title('Sq rel / SRD') axs[0, 3].plot([x['rmse'] for x in self.depth_error]) axs[0, 3].set_ylim(bottom=0) axs[0, 3].set_xlabel('Step') axs[0, 3].set_ylabel('RMSE') axs[0, 3].set_title('RMSE') axs[1, 3].plot([x['a1'] for x in self.depth_error]) axs[1, 3].set_ylim(top=1) axs[1, 3].set_xlabel('Step') axs[1, 3].set_ylabel('A1') axs[1, 3].set_title('A1') fig.tight_layout() plt.savefig(self.log_path / filename, bbox_inches='tight') plt.close(fig) ================================================ FILE: slam/transform.py ================================================ import numpy as np from scipy.spatial.transform import Rotation def print_tmat(tmat, note=''): return print_sixdof(tmat2sixdof(tmat), note) def print_array(array, note=''): return print_sixdof(array2sixdof(array), note) def print_sixdof(sixdof, note=''): print(string_sixdof(sixdof, note)) def string_tmat(tmat, note=''): return string_sixdof(tmat2sixdof(tmat), note) def string_sixdof(sixdof, note=''): string = f'R({np.rad2deg(sixdof["rx"]):>7.3f}, {np.rad2deg(sixdof["ry"]):>7.3f}, ' \ f'{np.rad2deg(sixdof["rz"]):>7.3f}), T({sixdof["tx"]:>7.3f}, ' \ f'{sixdof["ty"]:>7.3f}, {sixdof["tz"]:>7.3f}) {note}' return string def create_empty_sixdof(): sixdof = {'tx': 0, 'ty': 0, 'tz': 0, 'rx': 0, 'ry': 0, 'rz': 0} return sixdof def tmat2sixdof(tmat): r = Rotation.from_matrix(tmat[:3, :3]).as_rotvec() sixdof = { 'tx': tmat[0, 3], 'ty': tmat[1, 3], 'tz': tmat[2, 3], 'rx': r[0], 'ry': r[1], 'rz': r[2] } return sixdof def sixdof2tmat(sixdof): tmat = np.eye(4) tmat[:3, :3] = Rotation.from_rotvec([sixdof['rx'], sixdof['ry'], sixdof['rz']]).as_matrix() tmat[0, 3] = sixdof['tx'] tmat[1, 3] = sixdof['ty'] tmat[2, 3] = sixdof['tz'] return tmat def tmat2array(tmat): sixdof = tmat2sixdof(tmat) array = np.zeros((6, 1)) array[0] = sixdof['rx'] array[1] = sixdof['ry'] array[2] = sixdof['rz'] array[3] = sixdof['tx'] array[4] = sixdof['ty'] array[5] = sixdof['tz'] return array.T.ravel() def array2tmat(array): return sixdof2tmat(array2sixdof(array)) def array2sixdof(array): array_ = array.T sixdof = { 'rx': array_[0], 'ry': array_[1], 'rz': array_[2], 'tx': array_[3], 'ty': array_[4], 'tz': array_[5] } return sixdof def apply_transformation(transformation: np.ndarray, input_data: np.ndarray) -> np.ndarray: if len(input_data.shape) != 2 and len(input_data.shape) != 3: raise RuntimeError('data should be a 2D or 3D array') if len(transformation.shape) != 2: raise RuntimeError('transformation should be a 2D array') if transformation.shape[0] != transformation.shape[1]: raise RuntimeError('transformation should be square matrix') if len(input_data.shape) == 2: d = input_data.shape[1] input_data_ = input_data else: d = input_data.shape[2] input_data_ = input_data.reshape(-1, 3) if transformation.shape[0] != d + 1: raise RuntimeError('transformation dimension mismatch') if np.max(np.abs(transformation[-1, :] - np.r_[np.zeros(d), 1])) > np.finfo(float).eps * 1e3: raise RuntimeError('bad transformation') x_t = np.c_[input_data_, np.ones((input_data_.shape[0], 1))] @ transformation.T x_t = x_t[:, :d].reshape(input_data.shape) return x_t ================================================ FILE: slam/utils.py ================================================ import copy import pickle from typing import Dict, List, Optional, Tuple, Union import cv2 import matplotlib.pyplot as plt import numpy as np from torch import Tensor from depth_pose_prediction.networks.layers import BackprojectDepth from slam.meshlab import MeshlabInf def save_data(fname, obj): with open(fname, 'wb') as f: pickle.dump(obj, f) def load_data(fname): with open(fname, 'rb') as f: data = pickle.load(f) return data def depth_to_pcl(backproject_depth: BackprojectDepth, inv_camera_matrix: Tensor, depth: Tensor, image: Tensor, batch_size: int = 1, dist_threshold: float = np.inf): pcl = backproject_depth(depth, inv_camera_matrix) pcl = pcl.squeeze().cpu().detach().numpy()[:3, :].T color = image.view(batch_size, 3, -1).squeeze().cpu().detach().numpy().T pcl = np.c_[pcl, color] if not np.isinf(dist_threshold): dist = np.linalg.norm(pcl[:, :3], axis=1) pcl = pcl[dist < dist_threshold, :] return pcl def pcl_to_image( pcl: np.ndarray, camera_matrix: np.ndarray, image_shape: Tuple[int, int], ) -> np.ndarray: projection = cv2.projectPoints(pcl[:, :3].astype(np.float64), (0, 0, 0), (0, 0, 0), camera_matrix, np.zeros(4))[0].squeeze() image = np.zeros((image_shape[0], image_shape[1], 3)) depth = np.ones((image_shape[0], image_shape[1], 1)) * np.inf for i, (u, v) in enumerate(projection): u, v = int(np.floor(u)), int(np.floor(v)) if not (0 <= v < image_shape[0] and 0 <= u < image_shape[1]): continue distance = np.linalg.norm(pcl[i, :3]) if distance < depth[v, u]: depth[v, u] = distance image[v, u] = pcl[i, 3:] return image def save_point_cloud( filename: str, pcl: Union[np.ndarray, List[np.ndarray]], global_pose_list: Optional[np.ndarray] = None, verbose: bool = True, ) -> None: if global_pose_list is not None: accumulated_pcl = accumulate_pcl(pcl, global_pose_list) else: accumulated_pcl = pcl meshlab = MeshlabInf() meshlab.add_points(accumulated_pcl) meshlab.write(filename, verbose=verbose) def accumulate_pcl(pcl_list: List[np.ndarray], global_pose_list: np.ndarray) -> np.ndarray: accumulated_pcl = [] for i, (pcl, tmat) in enumerate(zip(pcl_list, global_pose_list)): accumulated_pcl.append(np.c_[(np.c_[pcl[:, :3], np.ones( (pcl.shape[0], 1))] @ tmat.T)[:, :3], pcl[:, 3:]]) accumulated_pcl = np.concatenate(accumulated_pcl) return accumulated_pcl def generate_figure( batch_i, image, depth, image_pcl, gt_xz, pred_xz, save_figure: bool = True, ) -> None: fig = plt.figure(figsize=(8, 10)) plt.subplot(411) plt.imshow(image) plt.axis('off') plt.title('Current frame') plt.subplot(412) vmax = np.percentile(depth, 95) plt.imshow(depth, cmap='magma_r', vmax=vmax) plt.axis('off') plt.title(f'Predicted depth (vmax={vmax:.3f})') plt.subplot(413) plt.imshow(image_pcl) plt.axis('off') plt.title('Projected PCL (w/o current frame)') plt.subplot(414) plt.plot(gt_xz[:, 0], gt_xz[:, 1], label='gt') plt.plot(pred_xz[:, 0], pred_xz[:, 1], label='pred') plt.xlim((-400, 15)) plt.ylim((-60, 160)) # plt.xlim((-10, 10)) # plt.ylim((-10, 10)) plt.legend() plt.tight_layout() if save_figure: plt.savefig(f'./figures/sequence_08/{batch_i:05}.png', ) else: plt.show() plt.close(fig) # ============================================================================= # Adapted from: # https://github.com/Huangying-Zhan/kitti-odom-eval/blob/master/kitti_odometry.py def scale_optimization(pred_poses, gt_poses): """ Optimize scaling factor """ # 2D trajectory if isinstance(pred_poses, np.ndarray) and pred_poses.shape[1] == 2: scaling = scale_lse_solver(pred_poses, gt_poses) pred_scaled = scaling * pred_poses # 3D poses (6 DoF) elif isinstance(pred_poses, list) and pred_poses[0].shape == (4, 4): # Scale only translation but keep rotation pred_xyz = np.asarray([p[:3, 3] for p in pred_poses]) gt_xyz = np.asarray([p[:3, 3] for p in gt_poses]) scaling = scale_lse_solver(pred_xyz, gt_xyz) pred_scaled = copy.deepcopy(pred_poses) for p in pred_scaled: p[:3, 3] *= scaling else: assert False return pred_scaled, scaling def scale_lse_solver(X, Y): """Least-squares-error solver Compute optimal scaling factor so that s(X)-Y is minimum Args: X (KxN array): current data Y (KxN array): reference data Returns: scale (float): scaling factor """ scale = np.sum(X * Y) / np.sum(X**2) return scale def trajectory_distances(poses): """Compute distance for each pose w.r.t frame-0 """ xyz = [p[:3, 3] for p in poses] dist = [0] for i in range(1, len(poses)): d = dist[i - 1] + np.linalg.norm(xyz[i] - xyz[i - 1]) dist.append(d) return dist def last_frame_from_segment_length(dist, first_frame, length): """Find frame (index) that away from the first_frame with the required distance Args: dist (float list): distance of each pose w.r.t frame-0 first_frame (int): start-frame index length (float): required distance Returns: i (int) / -1: end-frame index. if not found return -1 """ for i in range(first_frame, len(dist), 1): if dist[i] > (dist[first_frame] + length): return i return -1 def rotation_error(pose_error): """Compute rotation error Args: pose_error (4x4 array): relative pose error Returns: rot_error (float): rotation error """ a = pose_error[0, 0] b = pose_error[1, 1] c = pose_error[2, 2] d = 0.5 * (a + b + c - 1.0) rot_error = np.arccos(max(min(d, 1.0), -1.0)) return rot_error def translation_error(pose_error): """Compute translation error Args: pose_error (4x4 array): relative pose error Returns: trans_error (float): translation error """ dx = pose_error[0, 3] dy = pose_error[1, 3] dz = pose_error[2, 3] trans_error = np.sqrt(dx**2 + dy**2 + dz**2) return trans_error def calc_sequence_errors(pred_poses, gt_poses): """calculate sequence error """ error = [] dist = trajectory_distances(gt_poses) step_size = 10 sequence_lengths = [100, 200, 300, 400, 500, 600, 700, 800] num_lengths = len(sequence_lengths) for first_frame in range(0, len(gt_poses), step_size): for i in range(num_lengths): length = sequence_lengths[i] last_frame = last_frame_from_segment_length(dist, first_frame, length) # Continue if sequence is not long enough if last_frame == -1: continue # Compute rotational and translational errors pose_delta_gt = np.linalg.inv(gt_poses[first_frame]) @ gt_poses[last_frame] pose_delta_pred = np.linalg.inv(pred_poses[first_frame]) @ pred_poses[last_frame] pose_error = np.linalg.inv(pose_delta_pred) @ pose_delta_gt rot_error = rotation_error(pose_error) / length trans_error = translation_error(pose_error) / length # compute speed num_frames = last_frame - first_frame + 1 speed = length / (0.1 * num_frames) # Assume 10fps error.append([first_frame, rot_error, trans_error, length, speed]) return error def compute_segment_error(seq_errs): """This function calculates average errors for different segment. Args: seq_errs (list list): list of errs; [first_frame, rotation error, translation error, length, speed] - first_frame: frist frame index - rotation error: rotation error per length - translation error: translation error per length - length: evaluation trajectory length - speed: car speed (#FIXME: 10FPS is assumed) Returns: avg_segment_errs (dict): {100:[avg_t_err, avg_r_err],...} """ sequence_lengths = [100, 200, 300, 400, 500, 600, 700, 800] segment_errs = {} avg_segment_errs = {} for len_ in sequence_lengths: segment_errs[len_] = [] # Get errors for err in seq_errs: len_ = err[3] t_err = err[2] r_err = err[1] segment_errs[len_].append([t_err, r_err]) # Compute average for len_ in sequence_lengths: if segment_errs[len_] != []: avg_t_err = np.mean(np.asarray(segment_errs[len_])[:, 0]) avg_r_err = np.mean(np.asarray(segment_errs[len_])[:, 1]) avg_segment_errs[len_] = [avg_t_err, avg_r_err] else: avg_segment_errs[len_] = [] return avg_segment_errs def compute_overall_err(seq_err): """Compute average translation & rotation errors Args: seq_err (list list): [[r_err, t_err],[r_err, t_err],...] - r_err (float): rotation error - t_err (float): translation error Returns: ave_t_err (float): average translation error ave_r_err (float): average rotation error """ t_err = 0 r_err = 0 seq_len = len(seq_err) if seq_len == 0: return 0, 0 for item in seq_err: r_err += item[1] t_err += item[2] ave_t_err = t_err / seq_len ave_r_err = r_err / seq_len return ave_t_err, ave_r_err def compute_ATE(pred_poses, gt_poses): """Compute RMSE of ATE (abs. trajectory error) """ errors = [] for pred_pose, gt_pose in zip(pred_poses, gt_poses): gt_xyz = gt_pose[:3, 3] pred_xyz = pred_pose[:3, 3] align_err = gt_xyz - pred_xyz errors.append(np.sqrt(np.sum(align_err**2))) ate = np.sqrt(np.mean(np.asarray(errors)**2)) return ate def compute_RPE(pred_poses, gt_poses): """Compute RPE (rel. pose error) Returns: rpe_trans rpe_rot """ trans_errors = [] rot_errors = [] for i in range(len(pred_poses) - 1): # for i in list(pred.keys())[:-1]: gt1 = gt_poses[i] gt2 = gt_poses[i + 1] gt_rel = np.linalg.inv(gt1) @ gt2 pred1 = pred_poses[i] pred2 = pred_poses[i + 1] pred_rel = np.linalg.inv(pred1) @ pred2 rel_err = np.linalg.inv(gt_rel) @ pred_rel trans_errors.append(translation_error(rel_err)) rot_errors.append(rotation_error(rel_err)) rpe_trans = np.mean(np.asarray(trans_errors)) rpe_rot = np.mean(np.asarray(rot_errors)) return rpe_trans, rpe_rot def calc_error(pred_poses, gt_poses, optimize_scale: bool = False) -> str: log = '' if optimize_scale: pred_poses_scaled, scaling = scale_optimization(pred_poses, gt_poses) log += '-' * 10 + ' MEDIAN\n' log += f'Scaling: {scaling}' else: pred_poses_scaled = pred_poses sequence_error = calc_sequence_errors(pred_poses_scaled, gt_poses) # segment_error = compute_segment_error(sequence_error) ave_t_err, ave_r_err = compute_overall_err(sequence_error) log += '-' * 10 + '\n' log += f'Trans error (%): {ave_t_err * 100:.4f}' + '\n' log += f'Rot error (deg/100m): {100 * ave_r_err / np.pi * 180:.4f}' + '\n' # Compute ATE ate = compute_ATE(pred_poses, gt_poses) log += f'Abs traj RMSE (m): {ate:.4f}' + '\n' # Compute RPE rpe_trans, rpe_rot = compute_RPE(pred_poses, gt_poses) log += f'Rel pose error (m): {rpe_trans:.4f}' + '\n' log += f'Rel pose err (deg): {rpe_rot * 180 / np.pi:.4f}' + '\n' log += '-' * 10 + '\n' return log # ============================================================================= def calc_depth_error( pred_depth, gt_depth, median_scaling: bool = True, min_depth: Optional[float] = None, max_depth: Optional[float] = None, ) -> Dict[str, float]: gt_height, gt_width = gt_depth.shape pred_depth = cv2.resize(pred_depth, (gt_width, gt_height)) # Mask out pixels without ground truth depth # or ground truth depth farther away than the maximum predicted depth if max_depth is not None: mask = np.logical_and(gt_depth > min_depth, gt_depth < max_depth) else: mask = gt_depth > min_depth pred_depth = pred_depth[mask] gt_depth = gt_depth[mask] # Introduced by SfMLearner if median_scaling: ratio = np.median(gt_depth) / np.median(pred_depth) pred_depth *= ratio # Cap predicted depth at min and max depth pred_depth[pred_depth < min_depth] = min_depth if max_depth is not None: pred_depth[pred_depth > max_depth] = max_depth # Compute error metrics thresh = np.maximum((gt_depth / pred_depth), (pred_depth / gt_depth)) a1 = np.mean(thresh < 1.25) a2 = np.mean(thresh < 1.25**2) a3 = np.mean(thresh < 1.25**3) rmse = (gt_depth - pred_depth)**2 rmse_tot = np.sqrt(np.mean(rmse)) rmse_log = (np.log(gt_depth) - np.log(pred_depth))**2 rmse_log_tot = np.sqrt(np.mean(rmse_log)) abs_diff = np.mean(np.abs(gt_depth - pred_depth)) abs_rel = np.mean(np.abs(gt_depth - pred_depth) / gt_depth) sq_rel = np.mean(((gt_depth - pred_depth)**2) / gt_depth) metrics = { 'abs_diff': abs_diff, 'abs_rel': abs_rel, 'sq_rel': sq_rel, 'a1': a1, 'a2': a2, 'a3': a3, 'rmse': rmse_tot, 'rmse_log': rmse_log_tot } return metrics ================================================ FILE: third_party/fix_g2opy.py ================================================ #!/usr/bin/env python3 # pylint: disable = line-too-long import os from pathlib import Path INPUT_FILE = Path(__file__).parent / 'g2opy' / 'python' / 'core' / 'eigen_types.h' assert INPUT_FILE.exists() tmp_file = Path(str(INPUT_FILE) + '_tmp') with open(INPUT_FILE, 'r', encoding='utf-8') as input_file: with open(tmp_file, 'w', encoding='utf-8') as output_file: for i, line in enumerate(input_file.readlines()): if line == ' .def("x", (double (Eigen::Quaterniond::*) () const) &Eigen::Quaterniond::x)\n': line = ' .def("x", (double &(Eigen::Quaterniond::*)()) & Eigen::Quaterniond::x)\n' elif line == ' .def("y", (double (Eigen::Quaterniond::*) () const) &Eigen::Quaterniond::y)\n': line = ' .def("y", (double &(Eigen::Quaterniond::*)()) & Eigen::Quaterniond::y)\n' elif line == ' .def("z", (double (Eigen::Quaterniond::*) () const) &Eigen::Quaterniond::z)\n': line = ' .def("z", (double &(Eigen::Quaterniond::*)()) & Eigen::Quaterniond::z)\n' elif line == ' .def("w", (double (Eigen::Quaterniond::*) () const) &Eigen::Quaterniond::w)\n': line = ' .def("w", (double &(Eigen::Quaterniond::*)()) & Eigen::Quaterniond::w)\n' output_file.write(line) os.rename(tmp_file, INPUT_FILE)