Repository: apple/ml-stable-diffusion Branch: main Commit: e12202c1f640 Files: 61 Total size: 1.8 MB Directory structure: gitextract_srx0ds09/ ├── .github/ │ └── pull_request_template.md ├── .gitignore ├── ACKNOWLEDGEMENTS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.md ├── Package.swift ├── README.md ├── python_coreml_stable_diffusion/ │ ├── __init__.py │ ├── _version.py │ ├── activation_quantization.py │ ├── attention.py │ ├── chunk_mlprogram.py │ ├── controlnet.py │ ├── coreml_model.py │ ├── layer_norm.py │ ├── mixed_bit_compression_apply.py │ ├── mixed_bit_compression_pre_analysis.py │ ├── multilingual_projection.py │ ├── pipeline.py │ ├── torch2coreml.py │ └── unet.py ├── requirements.txt ├── setup.py ├── swift/ │ ├── StableDiffusion/ │ │ ├── pipeline/ │ │ │ ├── CGImage+vImage.swift │ │ │ ├── ControlNet.swift │ │ │ ├── DPMSolverMultistepScheduler.swift │ │ │ ├── Decoder.swift │ │ │ ├── DiscreteFlowScheduler.swift │ │ │ ├── Encoder.swift │ │ │ ├── ManagedMLModel.swift │ │ │ ├── MultiModalDiffusionTransformer.swift │ │ │ ├── MultilingualTextEncoder.swift │ │ │ ├── NumPyRandomSource.swift │ │ │ ├── NvRandomSource.swift │ │ │ ├── RandomSource.swift │ │ │ ├── ResourceManaging.swift │ │ │ ├── SafetyChecker.swift │ │ │ ├── SampleTimer.swift │ │ │ ├── Scheduler.swift │ │ │ ├── StableDiffusion3Pipeline+Resources.swift │ │ │ ├── StableDiffusion3Pipeline.swift │ │ │ ├── StableDiffusionPipeline+Resources.swift │ │ │ ├── StableDiffusionPipeline.Configuration.swift │ │ │ ├── StableDiffusionPipeline.swift │ │ │ ├── StableDiffusionXL+Resources.swift │ │ │ ├── StableDiffusionXLPipeline.swift │ │ │ ├── TextEncoder.swift │ │ │ ├── TextEncoderT5.swift │ │ │ ├── TextEncoderXL.swift │ │ │ ├── TorchRandomSource.swift │ │ │ └── Unet.swift │ │ └── tokenizer/ │ │ ├── BPETokenizer+Reading.swift │ │ ├── BPETokenizer.swift │ │ └── T5Tokenizer.swift │ ├── StableDiffusionCLI/ │ │ └── main.swift │ └── StableDiffusionTests/ │ ├── Resources/ │ │ ├── merges.txt │ │ └── vocab.json │ 
└── StableDiffusionTests.swift └── tests/ ├── __init__.py └── test_stable_diffusion.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/pull_request_template.md ================================================ Thank you for your interest in contributing to Core ML Stable Diffusion! Please review [CONTRIBUTING.md](../CONTRIBUTING.md) first. We appreciate your interest in the project! ================================================ FILE: .gitignore ================================================ *~ # Swift Package .DS_Store /.build /Packages /*.xcodeproj .swiftpm .vscode .*.sw? *.docc-build *.vs Package.resolved # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # macOS filesystem *.DS_Store ================================================ FILE: ACKNOWLEDGEMENTS ================================================ Acknowledgements Portions of this software may utilize the following copyrighted material, the use of which is hereby acknowledged. _____________________ The Hugging Face team (diffusers) Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. The Hugging Face team (transformers) Copyright 2018- The Hugging Face team. All rights reserved. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Facebook, Inc (PyTorch) From PyTorch: Copyright (c) 2016- Facebook, Inc (Adam Paszke) Copyright (c) 2014- Facebook, Inc (Soumith Chintala) Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) Copyright (c) 2011-2013 NYU (Clement Farabet) Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute (Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) From Caffe2: Copyright (c) 2016-present, Facebook Inc. All rights reserved. All contributions by Facebook: Copyright (c) 2016 Facebook Inc. All contributions by Google: Copyright (c) 2015 Google Inc. All rights reserved. All contributions by Yangqing Jia: Copyright (c) 2015 Yangqing Jia All rights reserved. All contributions by Kakao Brain: Copyright 2019-2020 Kakao Brain All contributions by Cruise LLC: Copyright (c) 2022 Cruise LLC. All rights reserved. All contributions from Caffe: Copyright(c) 2013, 2014, 2015, the respective contributors All rights reserved. All other contributions: Copyright(c) 2015, 2016 the respective contributors All rights reserved. Caffe2 uses a copyright model similar to Caffe: each contributor holds copyright over their contributions to Caffe2. 
The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America and IDIAP Research Institute nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
NumPy (RandomKit 1.3) Copyright (c) 2003-2005, Jean-Sebastien Roy (js@jeannot.org) The rk_random and rk_seed functions algorithms and the original design of the Mersenne Twister RNG: Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Original algorithm for the implementation of rk_interval function from Richard J. Wagner's implementation of the Mersenne Twister RNG, optimised by Magnus Jonsson. Constants used in the rk_double implementation by Isaku Wada. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies within all project spaces, and it also applies when an individual is representing the project or its community in public spaces. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) ================================================ FILE: CONTRIBUTING.md ================================================ # Contribution Guide Thank you for your interest in contributing to Core ML Stable Diffusion! This project was released for system demonstration purposes and there are limited plans for future development of the repository. While we welcome new pull requests and issues please note that our response may be limited. ## Submitting a Pull Request The project is licensed under the MIT license. By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the MIT license. ## Code of Conduct We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md). 
================================================ FILE: LICENSE.md ================================================ MIT License Copyright (c) 2024 Apple Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: Package.swift ================================================ // swift-tools-version: 5.8 // The swift-tools-version declares the minimum version of Swift required to build this package. 
import PackageDescription let package = Package( name: "stable-diffusion", platforms: [ .macOS(.v13), .iOS(.v16), ], products: [ .library( name: "StableDiffusion", targets: ["StableDiffusion"]), .executable( name: "StableDiffusionSample", targets: ["StableDiffusionCLI"]) ], dependencies: [ .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.2.3"), .package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.8"), ], targets: [ .target( name: "StableDiffusion", dependencies: [ .product(name: "Transformers", package: "swift-transformers"), ], path: "swift/StableDiffusion"), .executableTarget( name: "StableDiffusionCLI", dependencies: [ "StableDiffusion", .product(name: "ArgumentParser", package: "swift-argument-parser")], path: "swift/StableDiffusionCLI"), .testTarget( name: "StableDiffusionTests", dependencies: ["StableDiffusion"], path: "swift/StableDiffusionTests", resources: [ .copy("Resources/vocab.json"), .copy("Resources/merges.txt") ]), ] ) ================================================ FILE: README.md ================================================ # Core ML Stable Diffusion Run Stable Diffusion on Apple Silicon with Core ML [\[Blog Post\]](https://machinelearning.apple.com/research/stable-diffusion-coreml-apple-silicon) [\[BibTeX\]](#bibtex) This repository comprises: - `python_coreml_stable_diffusion`, a Python package for converting PyTorch models to Core ML format and performing image generation with Hugging Face [diffusers](https://github.com/huggingface/diffusers) in Python - `StableDiffusion`, a Swift package that developers can add to their Xcode projects as a dependency to deploy image generation capabilities in their apps. The Swift package relies on the Core ML model files generated by `python_coreml_stable_diffusion` If you run into issues during installation or runtime, please refer to the [FAQ](#faq) section. 
Please refer to the [System Requirements](#system-requirements) section before getting started. ## System Requirements
Details (Click to expand) Model Conversion: macOS | Python | coremltools | :------:|:------:|:-----------:| 13.1 | 3.8 | 7.0 | Project Build: macOS | Xcode | Swift | :------:|:-----:|:-----:| 13.1 | 14.3 | 5.8 | Target Device Runtime: macOS | iPadOS, iOS | :------:|:-----------:| 13.1 | 16.2 | Target Device Runtime ([With Memory Improvements](#compression-6-bits-and-higher)): macOS | iPadOS, iOS | :------:|:-----------:| 14.0 | 17.0 | Target Device Hardware Generation: Mac | iPad | iPhone | :------:|:-------:|:-------:| M1 | M1 | A14 |
## Performance Benchmarks
Details (Click to expand) [`stabilityai/stable-diffusion-2-1-base`](https://huggingface.co/apple/coreml-stable-diffusion-2-1-base) (512x512) | Device | `--compute-unit`| `--attention-implementation` | End-to-End Latency (s) | Diffusion Speed (iter/s) | | --------------------- | --------------- | ---------------------------- | ---------------------- | ------------------------ | | iPhone 12 Mini | `CPU_AND_NE` | `SPLIT_EINSUM_V2` | 18.5* | 1.44 | | iPhone 12 Pro Max | `CPU_AND_NE` | `SPLIT_EINSUM_V2` | 15.4 | 1.45 | | iPhone 13 | `CPU_AND_NE` | `SPLIT_EINSUM_V2` | 10.8* | 2.53 | | iPhone 13 Pro Max | `CPU_AND_NE` | `SPLIT_EINSUM_V2` | 10.4 | 2.55 | | iPhone 14 | `CPU_AND_NE` | `SPLIT_EINSUM_V2` | 8.6 | 2.57 | | iPhone 14 Pro Max | `CPU_AND_NE` | `SPLIT_EINSUM_V2` | 7.9 | 2.69 | | iPad Pro (M1) | `CPU_AND_NE` | `SPLIT_EINSUM_V2` | 11.2 | 2.19 | | iPad Pro (M2) | `CPU_AND_NE` | `SPLIT_EINSUM_V2` | 7.0 | 3.07 |
Details (Click to expand) - This benchmark was conducted by Apple and Hugging Face using public beta versions of iOS 17.0, iPadOS 17.0 and macOS 14.0 Seed 8 in August 2023. - The performance data was collected using the `benchmark` branch of the [Diffusers app](https://github.com/huggingface/swift-coreml-diffusers) - Swift code is not fully optimized, introducing up to ~10% overhead unrelated to Core ML model execution. - The median latency value across 5 back-to-back end-to-end executions is reported - The image generation procedure follows the standard configuration: 20 inference steps, 512x512 output image resolution, 77 text token sequence length, classifier-free guidance (batch size of 2 for unet). - The actual prompt length does not impact performance because the Core ML model is converted with a static shape that computes the forward pass for all of the 77 elements (`tokenizer.model_max_length`) in the text token sequence regardless of the actual length of the input text. - Weights are compressed to 6 bit precision. Please refer to [this section](#compression-6-bits-and-higher) for details. - Activations are in float16 precision for both the GPU and the Neural Engine. - `*` indicates that the [reduceMemory](https://github.com/apple/ml-stable-diffusion/blob/main/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift#L91) option was enabled which loads and unloads models just-in-time to avoid memory shortage. This added up to 2 seconds to the end-to-end latency. - In the benchmark table, we report the best performing `--compute-unit` and `--attention-implementation` values per device. The former does not modify the Core ML model and can be applied during runtime. The latter modifies the Core ML model. Note that the best performing compute unit is model version and hardware-specific. - Note that the performance optimizations in this repository (e.g.
`--attention-implementation`) are generally applicable to Transformers and not customized to Stable Diffusion. Better performance may be observed upon custom kernel tuning. Therefore, these numbers do not represent **peak** HW capability. - Performance may vary across different versions of Stable Diffusion due to architecture changes in the model itself. Each reported number is specific to the model version mentioned in that context. - Performance may vary due to factors like increased system load from other applications or suboptimal device thermal state.
[`stabilityai/stable-diffusion-xl-base-1.0-ios`](https://huggingface.co/apple/coreml-stable-diffusion-xl-base-ios) (768x768) | Device | `--compute-unit`| `--attention-implementation` | End-to-End Latency (s) | Diffusion Speed (iter/s) | | --------------------- | --------------- | ---------------------------- | ---------------------- | ------------------------ | | iPhone 12 Pro | `CPU_AND_NE` | `SPLIT_EINSUM` | 116* | 0.50 | | iPhone 13 Pro Max | `CPU_AND_NE` | `SPLIT_EINSUM` | 86* | 0.68 | | iPhone 14 Pro Max | `CPU_AND_NE` | `SPLIT_EINSUM` | 77* | 0.83 | | iPhone 15 Pro Max | `CPU_AND_NE` | `SPLIT_EINSUM` | 31 | 0.85 | | iPad Pro (M1) | `CPU_AND_NE` | `SPLIT_EINSUM` | 36 | 0.69 | | iPad Pro (M2) | `CPU_AND_NE` | `SPLIT_EINSUM` | 27 | 0.98 |
Details (Click to expand) - This benchmark was conducted by Apple and Hugging Face using iOS 17.0.2 and iPadOS 17.0.2 in September 2023. - The performance data was collected using the `benchmark` branch of the [Diffusers app](https://github.com/huggingface/swift-coreml-diffusers) - The median latency value across 5 back-to-back end-to-end executions is reported - The image generation procedure follows this configuration: 20 inference steps, 768x768 output image resolution, 77 text token sequence length, classifier-free guidance (batch size of 2 for unet). - `Unet.mlmodelc` is compressed to 4.04 bit precision following the [Mixed-Bit Palettization](#compression-lower-than-6-bits) algorithm recipe published [here](https://huggingface.co/apple/coreml-stable-diffusion-mixed-bit-palettization/blob/main/recipes/stabilityai-stable-diffusion-xl-base-1.0_palettization_recipe.json) - All models except for `Unet.mlmodelc` are compressed to 16 bit precision - [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) by [@madebyollin](https://github.com/madebyollin) was used as the source PyTorch model for `VAEDecoder.mlmodelc` in order to enable float16 weight and activation quantization for the VAE model. - `--attention-implementation SPLIT_EINSUM` is chosen in lieu of `SPLIT_EINSUM_V2` due to the prohibitively long compilation time of the latter - `*` indicates that the [reduceMemory](https://github.com/apple/ml-stable-diffusion/blob/main/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift#L91) option was enabled which loads and unloads models just-in-time to avoid memory shortage. This added significant overhead to the end-to-end latency. Note the end-to-end latency difference between `iPad Pro (M1)` and `iPhone 13 Pro Max` despite identical diffusion speed.
- The actual prompt length does not impact performance because the Core ML model is converted with a static shape that computes the forward pass for all of the 77 elements (`tokenizer.model_max_length`) in the text token sequence regardless of the actual length of the input text. - In the benchmark table, we report the best performing `--compute-unit` and `--attention-implementation` values per device. The former does not modify the Core ML model and can be applied during runtime. The latter modifies the Core ML model. Note that the best performing compute unit is model version and hardware-specific. - Note that the performance optimizations in this repository (e.g. `--attention-implementation`) are generally applicable to Transformers and not customized to Stable Diffusion. Better performance may be observed upon custom kernel tuning. Therefore, these numbers do not represent **peak** HW capability. - Performance may vary across different versions of Stable Diffusion due to architecture changes in the model itself. Each reported number is specific to the model version mentioned in that context. - Performance may vary due to factors like increased system load from other applications or suboptimal device thermal state.
[`stabilityai/stable-diffusion-xl-base-1.0`](https://huggingface.co/apple/coreml-stable-diffusion-xl-base) (1024x1024) | Device | `--compute-unit`| `--attention-implementation` | End-to-End Latency (s) | Diffusion Speed (iter/s) | | --------------------- | --------------- | ---------------------------- | ---------------------- | ------------------------ | | MacBook Pro (M1 Max) | `CPU_AND_GPU` | `ORIGINAL` | 46 | 0.46 | | MacBook Pro (M2 Max) | `CPU_AND_GPU` | `ORIGINAL` | 37 | 0.57 | | Mac Studio (M1 Ultra) | `CPU_AND_GPU` | `ORIGINAL` | 25 | 0.89 | | Mac Studio (M2 Ultra) | `CPU_AND_GPU` | `ORIGINAL` | 20 | 1.11 |
Details (Click to expand) - This benchmark was conducted by Apple and Hugging Face using public beta versions of iOS 17.0, iPadOS 17.0 and macOS 14.0 in July 2023. - The performance data was collected by running the `StableDiffusion` Swift pipeline. - The median latency value across 3 back-to-back end-to-end executions is reported - The image generation procedure follows the standard configuration: 20 inference steps, 1024x1024 output image resolution, classifier-free guidance (batch size of 2 for unet). - Weights and activations are in float16 precision - Performance may vary across different versions of Stable Diffusion due to architecture changes in the model itself. Each reported number is specific to the model version mentioned in that context. - Performance may vary due to factors like increased system load from other applications or suboptimal device thermal state. Given these factors, we do not report sub-second variance in latency.
## Weight Compression (6-bits and higher)
Details (Click to expand) coremltools-7.0 supports advanced weight compression techniques for [pruning](https://coremltools.readme.io/v7.0/docs/pruning), [palettization](https://coremltools.readme.io/v7.0/docs/palettization-overview) and [linear 8-bit quantization](https://coremltools.readme.io/v7.0/docs/quantization-aware-training). For these techniques, `coremltools.optimize.torch.*` includes APIs that require fine-tuning to maintain accuracy at higher compression rates whereas `coremltools.optimize.coreml.*` includes APIs that are applied post-training and are data-free. We demonstrate how data-free [post-training palettization](https://coremltools.readme.io/v7.0/docs/post-training-palettization) implemented in `coremltools.optimize.coreml.palettize_weights` enables us to achieve greatly improved performance for Stable Diffusion on mobile devices. This API implements the [Fast Exact k-Means](https://arxiv.org/abs/1701.07204) algorithm for optimal weight clustering which yields more accurate palettes. Using `--quantize-nbits {2,4,6,8}` during [conversion](#converting-models-to-coreml) is going to apply this compression to the unet and text_encoder models. For best results, we recommend [training-time palettization](https://coremltools.readme.io/v7.0/docs/training-time-palettization): `coremltools.optimize.torch.palettization.DKMPalettizer` if fine-tuning your model is feasible. This API implements the [Differentiable k-Means (DKM)](https://machinelearning.apple.com/research/differentiable-k-means) learned palettization algorithm. In this exercise, we stick to post-training palettization for the sake of simplicity and ease of reproducibility. The Neural Engine is capable of accelerating models with low-bit palettization: 1, 2, 4, 6 or 8 bits. With iOS 17 and macOS 14, compressed weights for Core ML models can be just-in-time decompressed during runtime (as opposed to ahead-of-time decompression upon load) to match the precision of activation tensors. 
This yields significant memory savings and enables models to run on devices with smaller RAM (e.g. iPhone 12 Mini). In addition, compressed weights are faster to fetch from memory which reduces the latency of memory bandwidth-bound layers. The just-in-time decompression behavior depends on the compute unit, layer type and hardware generation. | Weight Precision | `--compute-unit` | [`stabilityai/stable-diffusion-2-1-base`](https://huggingface.co/apple/coreml-stable-diffusion-2-1-base) generating *"a high quality photo of a surfing dog"* | | :---------------:| :----------------: | ------------------------------------------------------ | | 6-bit | cpuAndNeuralEngine | | | 16-bit | cpuAndNeuralEngine | | | 16-bit | cpuAndGPU | | Note that there are minor differences across 16-bit (float16) and 6-bit results. These differences are comparable to the differences across float16 and float32 or differences across compute units as exemplified above. We recommend a minimum of 6 bits for palettizing Stable Diffusion. Smaller number of bits (1, 2 and 4) will require either fine-tuning or advanced palettization techniques such as [MBP](#compression-lower-than-6-bits). Resources: - [Core ML Tools Docs: Optimizing Models](https://coremltools.readme.io/v7.0/docs/optimizing-models) - [WWDC23 Session Video: Use Core ML Tools for machine learning model compression](https://developer.apple.com/videos/play/wwdc2023/10047)
## Advanced Weight Compression (Lower than 6-bits)
Details (Click to expand) This section describes an advanced compression algorithm called [Mixed-Bit Palettization (MBP)](https://huggingface.co/blog/stable-diffusion-xl-coreml#what-is-mixed-bit-palettization) built on top of the [Post-Training Weight Palettization tools](https://apple.github.io/coremltools/docs-guides/source/post-training-palettization.html) and using the [Weights Metadata API](https://apple.github.io/coremltools/docs-guides/source/mlmodel-utilities.html#get-weights-metadata) from [coremltools](https://github.com/apple/coremltools). MBP builds a per-layer "palettization recipe" by picking a suitable number of bits among the Neural Engine supported bit-widths of 1, 2, 4, 6 and 8 in order to achieve the minimum average bit-width while maintaining a desired level of signal strength. The signal strength is measured by comparing the compressed model's output to that of the original float16 model. Given the same random seed and text prompts, PSNR between denoised latents is computed. The compression rate will depend on the model version as well as the tolerance for signal loss (drop in PSNR) since this algorithm is adaptive. | 3.41-bit | 4.50-bit | 6.55-bit | 16-bit (original) | | :-------:| :-------:| :-------:| :----------------:| | | | | | For example, the original float16 [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model has an ~82 dB signal strength. Naively applying [linear 8-bit quantization](https://coremltools.readme.io/docs/data-free-quantization) to the Unet model drops the signal to ~65 dB. Instead, applying MBP yields an average of 2.81-bits quantization while maintaining a signal strength of ~67 dB. This technique generally yields better results compared to using `--quantize-nbits` during model conversion but requires a "pre-analysis" run that takes up to a few hours on a single GPU (`mps` or `cuda`). 
Here is the signal strength (PSNR in dB) versus model size reduction (% of float16 size) for `stabilityai/stable-diffusion-xl-base-1.0`. The `{1,2,4,6,8}-bit` curves are generated by progressively palettizing more layers using a palette with a fixed number of bits. The layers were ordered in ascending order of their isolated impact to end-to-end signal strength so the cumulative compression's impact is delayed as much as possible. The mixed-bit curve is based on falling back to a higher number of bits as soon as a layer's isolated impact to end-to-end signal integrity drops below a threshold. Note that all curves based on palettization outperform linear 8-bit quantization at the same model size except for 1-bit. Here are the steps for applying this technique on another model version: **Step 1:** Run the pre-analysis script to generate "recipes" with varying signal strength: ```python python -m python_coreml_stable_diffusion.mixed_bit_compression_pre_analysis --model-version <model-version> -o <output-dir> ``` For popular base models, you may find the pre-computed pre-analysis results [here](https://huggingface.co/apple/coreml-stable-diffusion-mixed-bit-palettization/tree/main/recipes). Fine-tuned models are likely to honor the recipes of their corresponding base models but this is untested.
**Step 2:** The resulting JSON file from Step 1 will list "baselines", e.g.: ```json { "model_version": "stabilityai/stable-diffusion-xl-base-1.0", "baselines": { "original": 82.2, "linear_8bit": 66.025, "recipe_6.55_bit_mixedpalette": 79.9, "recipe_5.52_bit_mixedpalette": 78.2, "recipe_4.89_bit_mixedpalette": 76.8, "recipe_4.41_bit_mixedpalette": 75.5, "recipe_4.04_bit_mixedpalette": 73.2, "recipe_3.67_bit_mixedpalette": 72.2, "recipe_3.32_bit_mixedpalette": 71.4, "recipe_3.19_bit_mixedpalette": 70.4, "recipe_3.08_bit_mixedpalette": 69.6, "recipe_2.98_bit_mixedpalette": 68.6, "recipe_2.90_bit_mixedpalette": 67.8, "recipe_2.83_bit_mixedpalette": 67.0, "recipe_2.71_bit_mixedpalette": 66.3 } } ``` Among these baselines, select a recipe based on your desired signal strength. We recommend palettizing to ~4 bits depending on the use case even if the signal integrity for lower bit values is higher than the linear 8-bit quantization baseline. Finally, apply the selected recipe to the float16 Core ML model as follows: ```python python -m python_coreml_stable_diffusion.mixed_bit_compression_apply --mlpackage-path <path-to-mlpackage> -o <output-dir> --pre-analysis-json-path <path-to-pre-analysis-json> --selected-recipe <selected-recipe-string-key> ``` An example `<selected-recipe-string-key>` would be `"recipe_4.50_bit_mixedpalette"` which achieves an average of 4.50-bits compression (compressed from ~5.2GB to ~1.46GB for SDXL). Please note that signal strength does not directly map to image-text alignment. Always verify that your MBP-compressed model variant is accurately generating images for your test prompts.
## Activation Quantization
Details (Click to expand) On newer hardware with A17 Pro or M4 chips, such as the iPhone 15 Pro, quantizing both activations and weights to int8 can leverage optimized compute on the Neural Engine which can be used to improve runtime latency in compute-bound models. In this section, we demonstrate how to apply [Post Training Activation Quantization](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-algos.html#post-training-data-calibration-activation-quantization), using calibration data, on the Stable Diffusion UNet model. Similar to Mixed-Bit Palettization (MBP) described [above](#a-namecompression-lower-than-6-bitsa-advanced-weight-compression-lower-than-6-bits), first, a per-layer analysis is run to determine which intermediate activations are more sensitive to 8-bit compression. Less sensitive layers are weight and activation quantized (W8A8), whereas more sensitive layers are only weight quantized (W8A16). Here are the steps for applying this technique: **Step 1:** Generate calibration data ```python python -m python_coreml_stable_diffusion.activation_quantization --model-version <model-version> --generate-calibration-data -o <output-dir> ``` A set of calibration text prompts is run through StableDiffusionPipeline, and the UNet model inputs are recorded and stored as pickle files in the `calibration_data_<model-version>` folder inside the specified output directory. **Step 2:** Run layer-wise sensitivity analysis ```python python -m python_coreml_stable_diffusion.activation_quantization --model-version <model-version> --layerwise-sensitivity --calibration-nsamples <num-samples> -o <output-dir> ``` This will run the analysis on all Convolutional and Attention (Einsum) modules in the model. For each module, a compressed version is generated by quantizing only that layer’s weights and activations. Then the PSNR between the outputs of the compressed and original model is calculated, using the same random seed and text prompts. This analysis takes up to a few hours on a single GPU (cuda).
The number of calibration samples used to quantize the model can be reduced to speed up the process. The resulting JSON file looks like this: ```json { "conv": { "conv_in": 30.74, "down_blocks.0.attentions.0.proj_in": 38.93, "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_q": 48.15, "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_k": 50.13, "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_v": 45.70, "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.0": 39.56, ... }, "einsum": { "down_blocks.0.attentions.0.transformer_blocks.0.attn1.einsum": 25.34, "down_blocks.0.attentions.0.transformer_blocks.0.attn2.einsum": 31.76, "down_blocks.0.attentions.1.transformer_blocks.0.attn1.einsum": 23.40, "down_blocks.0.attentions.1.transformer_blocks.0.attn2.einsum": 31.56, ... }, "model_version": "stabilityai/stable-diffusion-2-1-base" } ``` **Step 3:** Generate quantized model Using calibration data and layer-wise sensitivity the quantized CoreML model can be generated as follows: ```python python -m python_coreml_stable_diffusion.activation_quantization --model-version --quantize-pytorch --conv-psnr 38 --attn-psnr 26 -o ``` The PSNR thresholds determine which layers will be activation quantized. This number can be tuned to trade-off between output quality and inference latency.
## Using Stable Diffusion 3
Details (Click to expand) ### Model Conversion Stable Diffusion 3 uses some new and some old models to run. For the text encoders, the conversion can be done using a similar command as before with the `--sd3-version` flag. ```bash python -m python_coreml_stable_diffusion.torch2coreml --model-version stabilityai/stable-diffusion-3-medium --bundle-resources-for-swift-cli --convert-text-encoder --sd3-version -o <output-dir> ``` For the new models (MMDiT, a new VAE with 16 channels, and the T5 text encoder), there are a number of new CLI flags that utilize the [DiffusionKit](https://www.github.com/argmaxinc/DiffusionKit) repo: - `--sd3-version`: Indicates to the converter to treat this as a Stable Diffusion 3 model - `--convert-mmdit`: Convert the MMDiT model - `--convert-vae-decoder`: Convert the new VAE model (this will use the 16 channel version if --sd3-version is set) - `--include-t5`: Downloads and includes a pre-converted T5 text encoder in the conversion e.g.: ```bash python -m python_coreml_stable_diffusion.torch2coreml --model-version stabilityai/stable-diffusion-3-medium --bundle-resources-for-swift-cli --convert-vae-decoder --convert-mmdit --include-t5 --sd3-version -o <output-dir> ``` To convert the full pipeline at 1024x1024 resolution, the following command may be used: ```bash python -m python_coreml_stable_diffusion.torch2coreml --model-version stabilityai/stable-diffusion-3-medium --bundle-resources-for-swift-cli --convert-text-encoder --convert-vae-decoder --convert-mmdit --include-t5 --sd3-version --latent-h 128 --latent-w 128 -o <output-dir> ``` Keep in mind that the MMDiT model is quite large and will require increasingly more memory and time to convert as the latent resolution increases. Also note that currently the MMDiT model requires fp32 and therefore only supports `CPU_AND_GPU` compute units and `ORIGINAL` attention implementation (the default for this pipeline). ### Swift Inference Swift inference for Stable Diffusion 3 is similar to the previous versions.
The only difference is that the `--sd3` flag should be used to indicate that the model is a Stable Diffusion 3 model. ```bash swift run StableDiffusionSample --resource-path --output-path --compute-units cpuAndGPU --sd3 ```
## Using Stable Diffusion XL
Details (Click to expand) ### Model Conversion e.g.: ```bash python -m python_coreml_stable_diffusion.torch2coreml --convert-unet --convert-vae-decoder --convert-text-encoder --xl-version --model-version stabilityai/stable-diffusion-xl-base-1.0 --refiner-version stabilityai/stable-diffusion-xl-refiner-1.0 --bundle-resources-for-swift-cli --attention-implementation {ORIGINAL,SPLIT_EINSUM} -o ``` - `--xl-version`: Additional argument to pass to the conversion script when specifying an XL model - `--refiner-version`: Additional argument to pass to the conversion script when specifying an XL refiner model, required for ["Ensemble of Expert Denoisers"](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#1-ensemble-of-expert-denoisers) inference. - `--attention-implementation`: `ORIGINAL` is recommended for `cpuAndGPU` for deployment on Mac - `--attention-implementation`: `SPLIT_EINSUM` is recommended for `cpuAndNeuralEngine` for deployment on iPhone & iPad - `--attention-implementation`: `SPLIT_EINSUM_V2` is not recommended for Stable Diffusion XL because of prohibitively long compilation time - **Tip:** Adding `--latent-h 96 --latent-w 96` is recommended for iOS and iPadOS deployment which leads to 768x768 generation as opposed to the default 1024x1024. - **Tip:** Due to known float16 overflow issues in the original Stable Diffusion XL VAE, [the model conversion script enforces float32 precision](https://github.com/apple/ml-stable-diffusion/blob/main/python_coreml_stable_diffusion/torch2coreml.py#L486). Using a custom VAE version such as [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) by [@madebyollin](https://github.com/madebyollin) via `--custom-vae-version madebyollin/sdxl-vae-fp16-fix` will restore the default float16 precision for VAE. 
### Swift Inference ```bash swift run StableDiffusionSample --resource-path --output-path --compute-units {cpuAndGPU,cpuAndNeuralEngine} --xl ``` - Only the `base` model is required, `refiner` model is optional and will be used by default if provided in the resource directory - ControlNet for XL is not yet supported ### Python Inference ```bash python -m python_coreml_stable_diffusion.pipeline --prompt --compute-unit {CPU_AND_GPU,CPU_AND_NE} -o -i --model-version stabilityai/stable-diffusion-xl-base-1.0 ``` - `refiner` model is not yet supported - ControlNet for XL is not yet supported
## Using ControlNet
Details (Click to expand) Example results using the prompt *"a high quality photo of a surfing dog"* conditioned on the scribble (leftmost): [ControlNet](https://huggingface.co/lllyasviel/ControlNet) allows users to condition image generation with Stable Diffusion on signals such as edge maps, depth maps, segmentation maps, scribbles and pose. Thanks to [@ryu38's contribution](https://github.com/apple/ml-stable-diffusion/pull/153), both the Python CLI and the Swift package support ControlNet models. Please refer to [this section](#converting-models-to-coreml) for details on setting up Stable Diffusion with ControlNet. Note that ControlNet is not yet supported for Stable Diffusion XL.
## Using the System Multilingual Text Encoder
Details (Click to expand) With iOS 17 and macOS 14, `NaturalLanguage` framework introduced the [NLContextualEmbedding](https://developer.apple.com/documentation/naturallanguage/nlcontextualembedding) which provides Transformer-based textual embeddings for Latin (20 languages), Cyrillic (4 languages) and CJK (3 languages) scripts. The WWDC23 session titled [Explore Natural Language multilingual models](https://developer.apple.com/videos/play/wwdc2023/10042) demonstrated how this powerful new model can be used by developers to train downstream tasks such as multilingual image generation with Stable Diffusion. The code to reproduce this demo workflow is made available in this repository. There are several ways in which this workflow can be implemented. Here is an example: **Step 1:** Curate an image-text dataset with the desired languages. **Step 2:** Pre-compute the NLContextualEmbedding values and replace the text strings with these embedding vectors in your dataset. **Step 3:** Fine-tune a base model from Hugging Face Hub that is compatible with the [StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview) by using your new dataset and replacing the default text_encoder with your pre-computed NLContextualEmbedding values. **Step 4:** In order to be able to swap the text_encoder of a base model without training new layers, the base model's `text_encoder.hidden_size` must match that of NLContextualEmbedding. If it doesn't, you will need to train a linear projection layer to map between the two dimensionalities. 
After fine-tuning, this linear layer should be converted to CoreML as follows: ```shell python -m python_coreml_stable_diffusion.multilingual_projection --input-path --output-dir ``` The command above will yield a `MultilingualTextEncoderProjection.mlmodelc` file under `--output-dir` and this should be colocated with the rest of the Core ML model assets that were generated through `--bundle-resources-for-swift-cli`. **Step 5:** The multilingual system text encoder can now be invoked by setting `useMultilingualTextEncoder` to true when initializing a pipeline or setting `--use-multilingual-text-encoder` in the CLI. Note that the model assets are distributed over-the-air, so the first invocation will trigger asset downloads, which total less than 100MB. Resources: - [WWDC23 Session Video: Explore Natural Language multilingual models](https://developer.apple.com/videos/play/wwdc2023/10042) - [NLContextualEmbedding API Documentation](https://developer.apple.com/documentation/naturallanguage/nlcontextualembedding)
## Using Ready-made Core ML Models from Hugging Face Hub
Click to expand 🤗 Hugging Face ran the [conversion procedure](#converting-models-to-coreml) on the following models and made the Core ML weights publicly available on the Hub. If you would like to convert a version of Stable Diffusion that is not already available on the Hub, please refer to the [Converting Models to Core ML](#converting-models-to-coreml). * 6-bit quantized models (suitable for iOS 17 and macOS 14): - [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/apple/coreml-stable-diffusion-1-4-palettized) - [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/apple/coreml-stable-diffusion-v1-5-palettized) - [`stabilityai/stable-diffusion-2-base`](https://huggingface.co/apple/coreml-stable-diffusion-2-base-palettized) - [`stabilityai/stable-diffusion-2-1-base`](https://huggingface.co/apple/coreml-stable-diffusion-2-1-base-palettized) * Mixed-bit quantized models - [`stabilityai/stable-diffusion-xl-base-1.0`](https://huggingface.co/apple/coreml-stable-diffusion-mixed-bit-palettization) - [`stabilityai/stable-diffusion-xl-base-1.0-ios`](https://huggingface.co/apple/coreml-stable-diffusion-xl-base-ios) * Uncompressed models: - [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/apple/coreml-stable-diffusion-v1-4) - [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/apple/coreml-stable-diffusion-v1-5) - [`stabilityai/stable-diffusion-2-base`](https://huggingface.co/apple/coreml-stable-diffusion-2-base) - [`stabilityai/stable-diffusion-2-1-base`](https://huggingface.co/apple/coreml-stable-diffusion-2-1-base) - [`stabilityai/stable-diffusion-xl-base-1.0`](https://huggingface.co/apple/coreml-stable-diffusion-xl-base) - [`stabilityai/stable-diffusion-xl-{base+refiner}-1.0`](https://huggingface.co/apple/coreml-stable-diffusion-xl-base-with-refiner) - [`stabilityai/stable-diffusion-3-medium`](https://huggingface.co/stabilityai/stable-diffusion-3-medium) If you want to use any of those models you may download the weights and proceed to [generate 
images with Python](#image-generation-with-python) or [Swift](#image-generation-with-swift). There are several variants in each model repository. You may clone the whole repos using `git` and `git lfs` to download all variants, or selectively download the ones you need. To clone the repos using `git`, please follow this process: **Step 1:** Install the `git lfs` extension for your system. `git lfs` stores large files outside the main git repo, and it downloads them from the appropriate server after you clone or checkout. It is available in most package managers, check [the installation page](https://git-lfs.com) for details. **Step 2:** Enable `git lfs` by running this command once: ```bash git lfs install ``` **Step 3:** Use `git clone` to download a copy of the repo that includes all model variants. For Stable Diffusion version 1.4, you'd issue the following command in your terminal: ```bash git clone https://huggingface.co/apple/coreml-stable-diffusion-v1-4 ``` If you prefer to download specific variants instead of cloning the repos, you can use the `huggingface_hub` Python library. For example, to do generation in Python using the `ORIGINAL` attention implementation (read [this section](#converting-models-to-coreml) for details), you could use the following helper code: ```Python from huggingface_hub import snapshot_download from pathlib import Path repo_id = "apple/coreml-stable-diffusion-v1-4" variant = "original/packages" model_path = Path("./models") / (repo_id.split("/")[-1] + "_" + variant.replace("/", "_")) snapshot_download(repo_id, allow_patterns=f"{variant}/*", local_dir=model_path, local_dir_use_symlinks=False) print(f"Model downloaded at {model_path}") ``` `model_path` would be the path in your local filesystem where the checkpoint was saved. Please, refer to [this post](https://huggingface.co/blog/diffusers-coreml) for additional details.
## Converting Models to Core ML
Click to expand **Step 1:** Create a Python environment and install dependencies: ```bash conda create -n coreml_stable_diffusion python=3.8 -y conda activate coreml_stable_diffusion cd /path/to/cloned/ml-stable-diffusion/repository pip install -e . ``` **Step 2:** Log in to or register for your [Hugging Face account](https://huggingface.co), generate a [User Access Token](https://huggingface.co/settings/tokens) and use this token to set up Hugging Face API access by running `huggingface-cli login` in a Terminal window. **Step 3:** Navigate to the version of Stable Diffusion that you would like to use on [Hugging Face Hub](https://huggingface.co/models?search=stable-diffusion) and accept its Terms of Use. The default model version is [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4). The model version may be changed by the user as described in the next step. **Step 4:** Execute the following command from the Terminal to generate Core ML model files (`.mlpackage`) ```shell python -m python_coreml_stable_diffusion.torch2coreml --convert-unet --convert-text-encoder --convert-vae-decoder --convert-safety-checker --model-version -o ``` **WARNING:** This command will download several GB worth of PyTorch checkpoints from Hugging Face. Please ensure that you are on Wi-Fi and have enough disk space. This generally takes 15-20 minutes on an M1 MacBook Pro. Upon successful execution, the 4 neural network models that comprise Stable Diffusion will have been converted from PyTorch to Core ML (`.mlpackage`) and saved into the specified ``. Some additional notable arguments: - `--model-version`: The model version name as published on the [Hugging Face Hub](https://huggingface.co/models?search=stable-diffusion) - `--refiner-version`: The refiner version name as published on the [Hugging Face Hub](https://huggingface.co/models?search=stable-diffusion). 
This is optional and if specified, this argument will convert and bundle the refiner unet alongside the model unet. - `--bundle-resources-for-swift-cli`: Compiles all 4 models and bundles them along with necessary resources for text tokenization into `/Resources` which should be provided as input to the Swift package. This flag is not necessary for the diffusers-based Python pipeline. [However using these compiled models in Python will significantly speed up inference](https://apple.github.io/coremltools/docs-guides/source/model-prediction.html#why-use-a-compiled-model). - `--quantize-nbits`: Quantizes the weights of unet and text_encoder models down to 2, 4, 6 or 8 bits using a globally optimal k-means clustering algorithm. By default all models are weight-quantized to 16 bits even if this argument is not specified. Please refer to [this section](#compression-6-bits-and-higher) for details and further guidance on weight compression. - `--chunk-unet`: Splits the Unet model in two approximately equal chunks (each with less than 1GB of weights) for mobile-friendly deployment. This is **required** for Neural Engine deployment on iOS and iPadOS if weights are not quantized to 6-bits or less (`--quantize-nbits {2,4,6}`). This is not required for macOS. Swift CLI is able to consume both the chunked and regular versions of the Unet model but prioritizes the former. Note that chunked unet is not compatible with the Python pipeline because Python pipeline is intended for macOS only. - `--attention-implementation`: Defaults to `SPLIT_EINSUM` which is the implementation described in [Deploying Transformers on the Apple Neural Engine](https://machinelearning.apple.com/research/neural-engine-transformers). `--attention-implementation SPLIT_EINSUM_V2` yields 10-30% improvement for mobile devices, still targeting the Neural Engine. `--attention-implementation ORIGINAL` will switch to an alternative implementation that should be used for CPU or GPU deployment on some Mac devices. 
Please refer to the [Performance Benchmark](#performance-benchmark) section for further guidance. - `--check-output-correctness`: Compares original PyTorch model's outputs to final Core ML model's outputs. This flag increases RAM consumption significantly so it is recommended only for debugging purposes. - `--convert-controlnet`: Converts ControlNet models specified after this option. This can also convert multiple models if you specify them like `--convert-controlnet lllyasviel/sd-controlnet-mlsd lllyasviel/sd-controlnet-depth`. - `--unet-support-controlnet`: enables a converted UNet model to receive additional inputs from ControlNet. This is required for generating images with ControlNet. The resulting model is saved with a different name, `*_control-unet.mlpackage`, distinct from the normal UNet. On the other hand, this UNet model cannot work without ControlNet. Please use the normal UNet for plain txt2img. - `--unet-batch-one`: use a batch size of one for the unet; this is needed if you do not want to do classifier free guidance, i.e. using a `guidance-scale` of less than one. - `--convert-vae-encoder`: not required for text-to-image applications. Required for image-to-image applications in order to map the input image to the latent space.
## Image Generation with Python
Click to expand Run text-to-image generation using the example Python pipeline based on [diffusers](https://github.com/huggingface/diffusers): ```shell python -m python_coreml_stable_diffusion.pipeline --prompt "a photo of an astronaut riding a horse on mars" -i -o --compute-unit ALL --seed 93 ``` Please refer to the help menu for all available arguments: `python -m python_coreml_stable_diffusion.pipeline -h`. Some notable arguments: - `-i`: Should point to the `-o` directory from Step 4 of [Converting Models to Core ML](#converting-models-to-coreml) section from above. If you specified `--bundle-resources-for-swift-cli` during conversion, then use the resulting `Resources` folder (which holds the compiled `.mlmodelc` files). [The compiled models load much faster after first use](https://apple.github.io/coremltools/docs-guides/source/model-prediction.html#why-use-a-compiled-model). - `--model-version`: If you overrode the default model version while converting models to Core ML, you will need to specify the same model version here. - `--compute-unit`: Note that the most performant compute unit for this particular implementation may differ across different hardware. `CPU_AND_GPU` or `CPU_AND_NE` may be faster than `ALL`. Please refer to the [Performance Benchmark](#performance-benchmark) section for further guidance. - `--scheduler`: If you would like to experiment with different schedulers, you may specify it here. For available options, please see the help menu. You may also specify a custom number of inference steps by `--num-inference-steps` which defaults to 50. - `--controlnet`: ControlNet models specified with this option are used in image generation. Use this option in the format `--controlnet lllyasviel/sd-controlnet-mlsd lllyasviel/sd-controlnet-depth` and make sure to use `--controlnet-inputs` in conjunction. - `--controlnet-inputs`: Image inputs corresponding to each ControlNet model. 
Please provide image paths in same order as models in `--controlnet`, for example: `--controlnet-inputs image_mlsd image_depth`. - `--unet-batch-one`: Do not batch unet predictions for the prompt and negative prompt. This requires the unet has been converted with a batch size of one, see `--unet-batch-one` option in conversion script.
## Image Generation with Swift
Click to expand ### Example CLI Usage ```shell swift run StableDiffusionSample "a photo of an astronaut riding a horse on mars" --resource-path /Resources/ --seed 93 --output-path ``` The output will be named based on the prompt and random seed: e.g. `/a_photo_of_an_astronaut_riding_a_horse_on_mars.93.final.png` Please use the `--help` flag to learn about batched generation and more. ### Example Library Usage ```swift import StableDiffusion ... let pipeline = try StableDiffusionPipeline(resourcesAt: resourceURL) pipeline.loadResources() let image = try pipeline.generateImages(prompt: prompt, seed: seed).first ``` On iOS, the `reduceMemory` option should be set to `true` when constructing `StableDiffusionPipeline`. ### Swift Package Details This Swift package contains two products: - `StableDiffusion` library - `StableDiffusionSample` command-line tool Both of these products require the Core ML models and tokenization resources to be supplied. When specifying resources via a directory path, that directory must contain the following: - `TextEncoder.mlmodelc` or `TextEncoder2.mlmodelc` (text embedding model) - `Unet.mlmodelc` or `UnetChunk1.mlmodelc` & `UnetChunk2.mlmodelc` (denoising autoencoder model) - `VAEDecoder.mlmodelc` (image decoder model) - `vocab.json` (tokenizer vocabulary file) - `merges.txt` (merges for byte pair encoding file) Optionally, for image2image, in-painting, or similar: - `VAEEncoder.mlmodelc` (image encoder model) Optionally, it may also include the safety checker model that some versions of Stable Diffusion include: - `SafetyChecker.mlmodelc` Optionally, for the SDXL refiner: - `UnetRefiner.mlmodelc` (refiner unet model) Optionally, for ControlNet: - `ControlledUNet.mlmodelc` or `ControlledUnetChunk1.mlmodelc` & `ControlledUnetChunk2.mlmodelc` (enabled to receive ControlNet values) - `controlnet/` (directory containing ControlNet models) - `LllyasvielSdControlnetMlsd.mlmodelc` (for example, from lllyasviel/sd-controlnet-mlsd) - 
`LllyasvielSdControlnetDepth.mlmodelc` (for example, from lllyasviel/sd-controlnet-depth) - Other models you converted Note that the chunked version of Unet is checked for first. Only if it is not present will the full `Unet.mlmodelc` be loaded. Chunking is required for iOS and iPadOS and not necessary for macOS.
## Example Swift App
Click to expand 🤗 Hugging Face created an [open-source demo app](https://github.com/huggingface/swift-coreml-diffusers) on top of this library. It's written in native Swift and Swift UI, and runs on macOS, iOS and iPadOS. You can use the code as a starting point for your app, or to see how to integrate this library in your own projects. Hugging Face has made the app [available in the Mac App Store](https://apps.apple.com/app/diffusers/id1666309574?mt=12).
## FAQ
Click to expand
Q1: ERROR: Failed building wheel for tokenizers or error: can't find Rust compiler A1: Please review this [potential solution](https://github.com/huggingface/transformers/issues/2831#issuecomment-592724471).
Q2: RuntimeError: {NSLocalizedDescription = "Error computing NN outputs." A2: There are many potential causes for this error. In this context, it is highly likely to be encountered when your system is under increased memory pressure from other applications. Reducing memory utilization of other applications is likely to help alleviate the issue.
Q3: My Mac has 8GB RAM and I am converting models to Core ML using the example command. The process is getting killed because of memory issues. How do I fix this issue? A3: In order to minimize the memory impact of the model conversion process, please execute the following command instead: ```bash python -m python_coreml_stable_diffusion.torch2coreml --convert-vae-encoder --model-version -o && \ python -m python_coreml_stable_diffusion.torch2coreml --convert-vae-decoder --model-version -o && \ python -m python_coreml_stable_diffusion.torch2coreml --convert-unet --model-version -o && \ python -m python_coreml_stable_diffusion.torch2coreml --convert-text-encoder --model-version -o && \ python -m python_coreml_stable_diffusion.torch2coreml --convert-safety-checker --model-version -o ``` If you need `--chunk-unet`, you may do so in yet another independent command which will reuse the previously exported Unet model and simply chunk it in place: ```bash python -m python_coreml_stable_diffusion.torch2coreml --convert-unet --chunk-unet -o ```
Q4: My Mac has 8GB RAM, should image generation work on my machine? A4: Yes! Especially the `--compute-unit CPU_AND_NE` option should work under reasonable system load from other applications. Note that part of the [Example Results](#example-results) were generated using an M2 MacBook Air with 8GB RAM.
Q5: Every time I generate an image using the Python pipeline, loading all the Core ML models takes 2-3 minutes. Is this expected? A5: Both `.mlpackage` and `.mlmodelc` models are compiled (also known as "model preparation" in Core ML terms) upon first load when a specific compute unit is specified. `.mlpackage` does not cache this compiled asset so each model load retriggers this compilation which may take up to a few minutes. On the other hand, `.mlmodelc` files do cache this compiled asset and non-first load times are reduced to just a few seconds. In order to benefit from compilation caching, you may use the `.mlmodelc` assets instead of `.mlpackage` assets in both Swift (default) and Python (possible thanks to [@lopez-hector](https://github.com/lopez-hector)'s [contribution](https://github.com/apple/ml-stable-diffusion/commit/f3a212491cf531dd88493c89ad3d98d016db407f)) image generation pipelines.
Q6: I want to deploy StableDiffusion, the Swift package, in my mobile app. What should I be aware of? A6: The [Image Generation with Swift](#image-gen-swift) section describes the minimum SDK and OS versions as well as the device models supported by this package. We recommend carefully testing the package on the device with the least amount of RAM available among your deployment targets. The image generation process in `StableDiffusion` can yield over 2 GB of peak memory during runtime depending on the compute units selected. On iPadOS, we recommend using `.cpuAndNeuralEngine` in your configuration and the `reduceMemory` option when constructing a `StableDiffusionPipeline` to minimize memory pressure. If your app crashes during image generation, consider adding the [Increased Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) capability to inform the system that some of your app’s core features may perform better by exceeding the default app memory limit on supported devices. On iOS, depending on the iPhone model, Stable Diffusion model versions, selected compute units, system load and design of your app, this may still not be sufficient to keep your app's peak memory under the limit. Please remember, because the device shares memory between apps and iOS processes, one app using too much memory can compromise the user experience across the whole device. We **strongly recommend** compressing your models following the recipes in [Advanced Weight Compression (Lower than 6-bits)](#compression-lower-than-6-bits) for iOS deployment. This reduces the peak RAM usage by up to 75% (from 16-bit to 4-bit) while preserving model output quality.
Q7: How do I generate images with different resolutions using the same Core ML models? A7: The current version of `python_coreml_stable_diffusion` does not support single-model multi-resolution out of the box. However, developers may fork this project and leverage the [flexible shapes](https://coremltools.readme.io/docs/flexible-inputs) support from coremltools to extend the `torch2coreml` script by using `coremltools.EnumeratedShapes`. Note that, while the `text_encoder` is agnostic to the image resolution, the inputs and outputs of `vae_decoder` and `unet` models are dependent on the desired image resolution.
Q8: Are the Core ML and PyTorch generated images going to be identical? A8: If desired, the generated images across PyTorch and Core ML can be made approximately identical. However, it is not guaranteed by default. There are several factors that might lead to different images across PyTorch and Core ML: 1. Random Number Generator Behavior The main source of potentially different results across PyTorch and Core ML is the Random Number Generator ([RNG](https://en.wikipedia.org/wiki/Random_number_generation)) behavior. PyTorch and Numpy have different sources of randomness. `python_coreml_stable_diffusion` generally relies on Numpy for RNG (e.g. latents initialization) and `StableDiffusion` Swift Library reproduces this RNG behavior by default. However, PyTorch-based pipelines such as Hugging Face `diffusers` rely on PyTorch's RNG behavior. Thanks to @liuliu's [contributions](https://github.com/apple/ml-stable-diffusion/pull/124), one can match the PyTorch (CPU/GPU) RNG behavior in Swift by specifying `--rng torch/cuda` which selects the `torchRNG/cudaRNG` mode. 2. PyTorch *"Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds."* ([source](https://pytorch.org/docs/stable/notes/randomness.html#reproducibility)). 3. Model Function Drift During Conversion The difference in outputs across corresponding PyTorch and Core ML models is a potential cause. The signal integrity is tested during the conversion process (enabled via `--check-output-correctness` argument to `python_coreml_stable_diffusion.torch2coreml`) and it is verified to be above a minimum [PSNR](https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio) value as tested on random inputs. Note that this is simply a sanity check and does not guarantee this minimum PSNR across all possible inputs. 
Furthermore, the results are not guaranteed to be identical when executing the same Core ML models across different compute units. This is not expected to be a major source of difference as the sample visual results indicate in [this section](#compression-6-bits-and-higher). 4. Weights and Activations Data Type When quantizing models from float32 to lower-precision data types such as float16, the generated images are [known to vary slightly](https://lambdalabs.com/blog/inference-benchmark-stable-diffusion) in semantics even when using the same PyTorch model. Core ML models generated by coremltools have float16 weights and activations by default [unless explicitly overridden](https://github.com/apple/coremltools/blob/main/coremltools/converters/_converters_entry.py#L256). This is not expected to be a major source of difference.
Q9: The model files are very large, how do I avoid a large binary for my App? A9: The recommended option is to prompt the user to download these assets upon first launch of the app. This keeps the app binary size independent of the Core ML models being deployed. Disclosing the size of the download to the user is extremely important as there could be data charges or storage impact that the user might not be comfortable with.
Q10: `Could not initialize NNPACK! Reason: Unsupported hardware` A10: This warning is safe to ignore in the context of this repository.
Q11: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect A11: This warning is safe to ignore in the context of this repository.
Q12: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown A12: If this warning is printed right after zsh: killed python -m python_coreml_stable_diffusion.torch2coreml ... , then it is highly likely that your Mac has run out of memory while converting models to Core ML. Please see [Q3](#low-mem-conversion) from above for the solution.
## BibTeX Reference ```latex @misc{stable-diffusion-coreml-apple-silicon, title = {Stable Diffusion with Core ML on Apple Silicon}, author = {Atila Orhon and Michael Siracusa and Aseem Wadhwa}, year = {2022}, URL = {null} } ``` ================================================ FILE: python_coreml_stable_diffusion/__init__.py ================================================ from ._version import __version__ ================================================ FILE: python_coreml_stable_diffusion/_version.py ================================================ __version__ = "1.1.0" ================================================ FILE: python_coreml_stable_diffusion/activation_quantization.py ================================================ # # For licensing see accompanying LICENSE.md file. # Copyright (C) 2022 Apple Inc. All Rights Reserved. # import logging import operator import torch logging.basicConfig() logger = logging.getLogger() logger.setLevel('INFO') import argparse import gc import json import os import pickle from copy import deepcopy import coremltools as ct import numpy as np from coremltools.optimize.torch.quantization import ( LinearQuantizer, LinearQuantizerConfig, ModuleLinearQuantizerConfig) from diffusers import StableDiffusionPipeline from tqdm import tqdm from python_coreml_stable_diffusion import attention from python_coreml_stable_diffusion import unet from python_coreml_stable_diffusion.layer_norm import LayerNormANE from python_coreml_stable_diffusion.torch2coreml import compute_psnr from python_coreml_stable_diffusion.unet import Einsum attention.SPLIT_SOFTMAX = True CALIBRATION_DATA = [ "image of a transparent tall glass with ice, fruits and mint, photograph, commercial, food, warm background, beautiful image, detailed", "picture of dimly lit living room, minimalist furniture, vaulted ceiling, huge room, floor to ceiling window with an ocean view, nighttime, 3D render, high quality, detailed", "modern office building, 8 stories tall, glass and 
steel, 3D render style, wide angle view, very detailed, sharp photographic image, in an office park, bright sunny day, clear blue skies, trees and landscaping", "cute small cat sitting in a movie theater eating popcorn, watching a movie, cozy indoor lighting, detailed, digital painting, character design", "a highly detailed matte painting of a man on a hill watching a rocket launch in the distance by studio ghibli, volumetric lighting, octane render, 4K resolution, hyperrealism, highly detailed, insanely detailed, cinematic lighting, depth of field", "an undersea world with several of fish, rocks, detailed, realistic, photograph, amazing, beautiful, high resolution", "large ocean wave hitting a beach at sunset, photograph, detailed", "pocket watch on a table, close up. macro, sharp, high gloss, brass, gears, sharp, detailed", "pocket watch in the style of pablo picasso, painting", "majestic royal tall ship on a calm sea, realistic painting, cloudy blue sky, in the style of edward hopper", "german castle on a mountain, blue sky, realistic, photograph, dramatic, wide angle view", "artificial intelligence, AI, concept art, blue line sketch", "a humanoid robot, concept art, 3D render, high quality, detailed", "donut with sprinkles and a cup of coffee on a wood table, detailed, photograph", "orchard at sunset, beautiful, photograph, great composition, detailed, realistic, HDR", "image of a map of a country, tattered, old, styled, illustration, for a video game style", "blue and green woven fibers, nano fiber material, detailed, concept art, micro photography", ] RANDOM_TEST_DATA = [ "a black and brown dog standing outside a door.", "a person on a motorcycle makes a turn on the track.", "inflatable boats sit on the arizona river, and on the bank", "a white cat sitting under a white umbrella", "black bear standing in a field of grass under a tree.", "a train that is parked on tracks and has graffiti writing on it, with a mountain range in the background.", "a cake inside 
of a pan sitting in an oven.", "a table with paper plates and flowers in a home", ] def get_coreml_inputs(sample_inputs): return [ ct.TensorType( name=k, shape=v.shape, dtype=v.numpy().dtype if isinstance(v, torch.Tensor) else v.dtype, ) for k, v in sample_inputs.items() ] def convert_to_coreml(torchscript_module, sample_inputs): logger.info("Converting model to CoreML..") coreml_model = ct.convert( torchscript_module, convert_to="mlprogram", minimum_deployment_target=ct.target.macOS14, inputs=get_coreml_inputs(sample_inputs), outputs=[ct.TensorType(name="noise_pred", dtype=np.float32)], compute_units=ct.ComputeUnit.ALL, skip_model_load=True, ) return coreml_model def unet_data_loader(data_dir, device='cpu', calibration_nsamples=None): """ Load calibration data from specified path. Limit number of samples to calibration_nsamples, if specified. """ dataloader = [] skip_load = False for file in sorted(os.listdir(data_dir)): if file.endswith('.pkl'): filepath = os.path.join(data_dir, file) with open(filepath, 'rb') as data: try: while not skip_load: unet_data = pickle.load(data) for input in unet_data: dataloader.append([x.to(torch.float).to(device) for x in input]) if calibration_nsamples: if len(dataloader) >= calibration_nsamples: skip_load = True break except EOFError: pass if skip_load: break logger.info(f"Total calibration samples: {len(dataloader)}") return dataloader def quantize_module_config(module_name): """ Generate quantization config to apply W8A8 quantization for specified module. Rest of the model is kept in FP32 precision. 
""" config = LinearQuantizerConfig( global_config=ModuleLinearQuantizerConfig( milestones=[0, 1000, 1000, 0], weight_dtype=torch.float32, activation_dtype=torch.float32, ), module_name_configs={ module_name: ModuleLinearQuantizerConfig( quantization_scheme="symmetric", milestones=[0, 1000, 1000, 0], ), }, ) return config def quantize_cumulative_config(skip_conv_layers, skip_einsum_layers): """ Generate quantization config to apply W8A8 quantization. Skipped layers are kept in W8A32 precision. """ logger.info(f"Skipping {len(skip_conv_layers)} conv layers and {len(skip_einsum_layers)} einsum layers") w8config = ModuleLinearQuantizerConfig( quantization_scheme="symmetric", milestones=[0, 1000, 1000, 0], activation_dtype=torch.float32) conv_modules_config = {name: w8config for name in skip_conv_layers} einsum_modules_config = {name: w8config for name in skip_einsum_layers} module_name_config = {} module_name_config.update(conv_modules_config) module_name_config.update(einsum_modules_config) config = LinearQuantizerConfig( global_config=ModuleLinearQuantizerConfig( quantization_scheme="symmetric", milestones=[0, 1000, 1000, 0], ), module_name_configs=module_name_config, module_type_configs={ torch.cat: None, torch.nn.GroupNorm: None, torch.nn.SiLU: None, torch.nn.functional.gelu: None, operator.add: None, }, ) return config def quantize(model, config, calibration_data): """ Apply post training activation quantization to specified model, using calibration data """ submodules = dict(model.named_modules(remove_duplicate=True)) layer_norm_modules = [key for key, val in submodules.items() if isinstance(val, LayerNormANE)] non_traceable_module_names = layer_norm_modules + [ "time_proj", "time_embedding", ] # Mark certain modules as non-traceable to make the UNet model fx traceable config.non_traceable_module_names = non_traceable_module_names config.preserved_attributes = ['config', 'device'] sample_input = calibration_data[0] quantizer = LinearQuantizer(model, config) 
def recipe_overrides_for_quality(conv_layers, skipped_conv):
    """
    Keep output-projection conv layers out of activation quantization.

    Every name in `conv_layers` that contains "to_out" is added to the
    `skipped_conv` set (mutated in place) unless it is already present.

    Rationale: quantizing these layers would quantize the outputs of the
    preceding concat layers. Concat inputs can have varied ranges yet share
    a single scale, which can degrade quality. This constraint is enforced
    during model conversion, so the layer-wise PSNR analysis of the PyTorch
    model may not capture it.

    Args:
        conv_layers: Iterable of conv layer names.
        skipped_conv: Set of layer names excluded from quantization.
    """
    for candidate in conv_layers:
        if "to_out" not in candidate:
            continue
        if candidate in skipped_conv:
            continue
        logger.info(f"adding {candidate}")
        skipped_conv.add(candidate)
def run_pipe(pipe, seed=None):
    """
    Run `pipe` on the RANDOM_TEST_DATA prompts and return the latents.

    Args:
        pipe: Callable pipeline. It is invoked with `prompt`, `output_type`
            and `generator` keyword arguments, and its result must expose an
            `images` attribute.
        seed: Seed for the torch generator handed to the pipeline. When left
            as None the function falls back to the module-level `args.seed`,
            which is how this function always behaved; that global only
            exists when the file is executed as a script.

    Returns:
        numpy array built from every element of `.images`, each moved to CPU
        and converted with `.numpy()`.
    """
    if seed is None:
        # Backward-compatible fallback to the script-level namespace.
        seed = args.seed

    generator = torch.manual_seed(seed)
    result = pipe(
        prompt=RANDOM_TEST_DATA,
        output_type="latent",
        generator=generator,
    )
    return np.array([latent.cpu().numpy() for latent in result.images])
logger.info(f"Quantizing UNet Layer: {module_name}") config = quantize_module_config(module_name) quantized_unet = quantize(ref_pipe.unet, config, dataloader) # Generate outputs from quantized model q_pipe, _ = prepare_pipe(ref_pipe, quantized_unet) test_out = run_pipe(q_pipe) psnr = [float(f"{compute_psnr(r, t):.1f}") for r, t in zip(ref_out, test_out)] logger.info(f"PSNR: {psnr}") avg_psnr = sum(psnr) / len(psnr) logger.info(f"AVG PSNR: {avg_psnr}") results[module_type][module_name] = avg_psnr del quantized_unet del q_pipe gc.collect() with open(recipe_json_path, 'w') as f: json.dump(results, f, indent=2) if args.quantize_pytorch: logger.info("Quantizing UNet PyTorch model") dataloader = unet_data_loader(calibration_dir, device, args.calibration_nsamples) with open(recipe_json_path, "r") as f: results = json.load(f) logger.info(f"Conv PSNR threshold: {args.conv_psnr}, Attn PSNR threshold: {args.attn_psnr}") skipped_conv = set([layer for layer, psnr in results['conv'].items() if psnr < args.conv_psnr]) skipped_einsum = set([layer for layer, psnr in results['einsum'].items() if psnr < args.attn_psnr]) # Apply some overrides on PSNR based recipe for inference and quality improvements # Users can disable these selectively based on specific targets recipe_overrides_for_inference_speedup(results['conv'].keys(), skipped_conv) recipe_overrides_for_quality(results['conv'].keys(), skipped_conv) config = quantize_cumulative_config(skipped_conv, skipped_einsum) quantized_unet = quantize(ref_pipe.unet, config, dataloader) # Generate outputs from quantized model q_pipe, handle = prepare_pipe(ref_pipe, quantized_unet) test_out = run_pipe(q_pipe) psnr = [float(f"{compute_psnr(r, t):.1f}") for r, t in zip(ref_out, test_out)] logger.info(f"PSNR: {psnr}") avg_psnr = sum(psnr) / len(psnr) logger.info(f"AVG PSNR: {avg_psnr}") handle.remove() quantized_unet.to('cpu') sample_unet_input = { "sample": dataloader[0][0].to('cpu'), "timestep": dataloader[0][1].to('cpu'), 
"encoder_hidden_states": dataloader[0][2].to('cpu'), } logger.info("JIT tracing quantized model") traced_model = torch.jit.trace(quantized_unet, example_inputs=list(sample_unet_input.values())) logger.info("Converting to CoreML") coreml_sample_unet_input = { k: v.numpy().astype(np.float16) for k, v in sample_unet_input.items() } coreml_model = convert_to_coreml(traced_model, coreml_sample_unet_input) coreml_filename = f"Stable_Diffusion_version_{args.model_version.replace('/', '_')}_unet.mlpackage" coreml_model.save(os.path.join(args.o, coreml_filename)) del q_pipe del ref_pipe gc.collect() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-o", required=True, help="Output directory to save calibration data and quantization artifacts" ) parser.add_argument( "--model-version", required=True, choices=("runwayml/stable-diffusion-v1-5", "stabilityai/stable-diffusion-2-1-base"), help= ("The pre-trained model checkpoint and configuration to restore" )) parser.add_argument( "--generate-calibration-data", action="store_true", help="Generate calibration data for UNet model" ) parser.add_argument( "--layerwise-sensitivity", action="store_true", help="Compute compression sensitivity per-layer, by quantizing one layer at a time" ) parser.add_argument( "--quantize-pytorch", action="store_true", help="Generate activation quantized UNet model by quantizing layers above specified PSNR threshold" ) parser.add_argument( "--calibration-nsamples", type=int, help="Number of samples to use for calibrating UNet model" ) parser.add_argument("--seed", "-s", default=50, type=int, help="Random seed to be able to reproduce results" ) parser.add_argument("--conv-psnr", default=40.0, type=float, help="PSNR threshold for convolutional layers (default for stabilityai/stable-diffusion-2-1-base)" ) parser.add_argument("--attn-psnr", default=30.0, type=float, help="PSNR threshold for attention (Einsum) layers (default for stabilityai/stable-diffusion-2-1-base)" ) 
def softmax(x, dim):
    """
    Softmax along `dim`, decomposed into elementary ops (max-reduction,
    elementwise subtract, exp2, sum-reduction, reciprocal, multiply).

    The input tensor is left unmodified. An earlier form subtracted the
    maximum in place (`x -= max_x`), which silently overwrote the caller's
    tensor.

    Args:
        x: Input tensor.
        dim: Dimension along which to normalize.

    Returns:
        Tensor with the same shape as `x` whose entries along `dim` sum to 1.
    """
    # Reduction max
    max_x = x.max(dim=dim, keepdim=True).values
    # EW sub (out of place, so the caller's tensor is preserved)
    shifted_x = x - max_x
    # Scale for EXP to EXP2, Activation EXP2: exp(t) == exp2(t / ln 2)
    scaled_x = shifted_x * (1 / math.log(2))
    exp_act = torch.exp2(scaled_x)
    # Reduction Sum + Inv
    exp_sum_inv = 1 / exp_act.sum(dim=dim, keepdim=True)
    # EW Mult
    return exp_act * exp_sum_inv
CHUNK_SIZE = 512


def split_einsum_v2(q, k, v, mask, heads, dim_head):
    """ Attention Implementation backing AttentionImplementations.SPLIT_EINSUM_V2

    - Implements https://machinelearning.apple.com/research/neural-engine-transformers
    - Recommended for ANE
    - Marginally slower on GPU
    - Chunks the query sequence to avoid large intermediate tensors and improves ANE performance

    The query sequence (last dimension of `q`) is processed in slices of at
    most CHUNK_SIZE positions. When it is shorter than CHUNK_SIZE the call is
    delegated to `split_einsum`.

    Two gaps of the earlier implementation are closed here:
    - A query length that is not a multiple of CHUNK_SIZE no longer loses its
      trailing positions; the remainder is processed as a final, shorter chunk.
    - `mask` is no longer ignored on the chunked path. It is added to the
      attention weights before the softmax, as `split_einsum` does.

    Args:
        q, k, v: Tensors laid out as (bs, heads * dim_head, 1, seq_length).
        mask: Optional tensor broadcastable to (bs, key_length, 1, query_length).
        heads: Number of attention heads.
        dim_head: Channels per head.

    Returns:
        Tensor of shape (bs, heads * dim_head, 1, query_length).
    """
    query_seq_length = q.size(3)

    if query_seq_length // CHUNK_SIZE == 0:
        logger.info(
            "AttentionImplementations.SPLIT_EINSUM_V2: query sequence too short to chunk "
            f"({query_seq_length}<{CHUNK_SIZE}), fall back to AttentionImplementations.SPLIT_EINSUM (safe to ignore)")
        return split_einsum(q, k, v, mask, heads, dim_head)

    # One (start, stop) pair per chunk; the last chunk may be shorter.
    chunk_bounds = [
        (start, min(start + CHUNK_SIZE, query_seq_length))
        for start in range(0, query_seq_length, CHUNK_SIZE)
    ]
    num_chunks = len(chunk_bounds)

    logger.info(
        "AttentionImplementations.SPLIT_EINSUM_V2: Splitting query sequence length of "
        f"{query_seq_length} into {num_chunks} chunks")

    mh_q = [
        q[:, head_idx * dim_head:(head_idx + 1) *
          dim_head, :, :] for head_idx in range(heads)
    ]  # (bs, dim_head, 1, max_seq_length) * heads

    # Chunk the query sequence for each head
    mh_q_chunked = [
        [h_q[..., start:stop] for start, stop in chunk_bounds]
        for h_q in mh_q
    ]  # ((bs, dim_head, 1, <=CHUNK_SIZE) * num_chunks) * heads

    k = k.transpose(1, 3)
    mh_k = [
        k[:, :, :,
          head_idx * dim_head:(head_idx + 1) * dim_head]
        for head_idx in range(heads)
    ]  # (bs, max_seq_length, 1, dim_head) * heads

    mh_v = [
        v[:, head_idx * dim_head:(head_idx + 1) *
          dim_head, :, :] for head_idx in range(heads)
    ]  # (bs, dim_head, 1, max_seq_length) * heads

    attn_weights = [
        [
            torch.einsum("bchq,bkhc->bkhq", [qi_chunk, ki]) * (dim_head**-0.5)
            for qi_chunk in h_q_chunked
        ] for h_q_chunked, ki in zip(mh_q_chunked, mh_k)
    ]  # ((bs, max_seq_length, 1, chunk_size) * num_chunks) * heads

    if mask is not None:
        # The query axis is the last one. Slice the mask along it when it
        # spans the full query length; otherwise rely on broadcasting.
        if mask.dim() > 0 and mask.size(-1) == query_seq_length:
            mask_chunks = [mask[..., start:stop] for start, stop in chunk_bounds]
        else:
            mask_chunks = [mask] * num_chunks
        attn_weights = [
            [aw_chunk + mask_chunk
             for aw_chunk, mask_chunk in zip(aw_chunked, mask_chunks)]
            for aw_chunked in attn_weights
        ]

    attn_weights = [
        [aw_chunk.softmax(dim=1) for aw_chunk in aw_chunked]
        for aw_chunked in attn_weights
    ]  # ((bs, max_seq_length, 1, chunk_size) * num_chunks) * heads

    attn = [
        [
            torch.einsum("bkhq,bchk->bchq", wi_chunk, vi)
            for wi_chunk in wi_chunked
        ] for wi_chunked, vi in zip(attn_weights, mh_v)
    ]  # ((bs, dim_head, 1, chunk_size) * num_chunks) * heads

    attn = torch.cat([
        torch.cat(attn_chunked, dim=3) for attn_chunked in attn
    ], dim=1)  # (bs, dim, 1, max_seq_length)

    return attn
def _verify_output_correctness_of_chunks(full_model,
                                         first_chunk_model=None,
                                         second_chunk_model=None,
                                         pipeline_model=None,):
    """ Compare the outputs of the full (original) model with those produced
    by its chunked counterpart on the same randomly generated inputs.

    Either `pipeline_model`, or both `first_chunk_model` and
    `second_chunk_model`, must be supplied; `pipeline_model` takes precedence.
    Raises ValueError otherwise. The comparison for every output of the full
    model is delegated to `torch2coreml.report_correctness`.
    """
    # Random inputs matching the full model's declared input features
    random_inputs = {
        feature.name: random_gen_input_feature_type(feature)
        for feature in full_model._spec.description.input
    }

    # Reference outputs from the full model
    reference_outputs = full_model.predict(random_inputs)

    chunks_available = (first_chunk_model is not None
                        and second_chunk_model is not None)
    if pipeline_model is None and not chunks_available:
        raise ValueError

    if pipeline_model is not None:
        candidate_outputs = pipeline_model.predict(random_inputs)
    else:
        first_chunk_outputs = first_chunk_model.predict(random_inputs)

        # Second chunk inputs come from the first chunk's outputs when
        # available, and from the regular inputs otherwise.
        second_chunk_inputs = {}
        for feature in second_chunk_model._spec.description.input:
            if feature.name in first_chunk_outputs:
                provider = first_chunk_outputs
            else:
                provider = random_inputs
            second_chunk_inputs[feature.name] = provider[feature.name]

        candidate_outputs = second_chunk_model.predict(second_chunk_inputs)

    # Report correctness for every output of the full model
    for out_name, expected in reference_outputs.items():
        torch2coreml.report_correctness(
            original_outputs=expected,
            final_outputs=candidate_outputs[out_name],
            log_prefix=f"{out_name}")
def _get_first_chunk_outputs(block, op_idx):
    """ Collect the vars that cross the chunk boundary.

    A var crosses the boundary when it is produced by a non-const op of the
    first program (ops 0..op_idx, inclusive), carries no constant value
    (`var.val is None`) and is consumed by at least one op of the second
    program (ops op_idx+1 onwards). These vars need to become the outputs of
    the first program and the inputs of the second program.

    Side effect: `block.operations` is replaced by a list copy of itself.

    Args:
        block: Object exposing an `operations` sequence.
        op_idx: Index of the last op that belongs to the first program.

    Returns:
        List of boundary vars (unordered, no duplicates).

    Raises:
        ValueError: If a consumer op is not part of `block.operations`.
    """
    block.operations = list(block.operations)

    # Position of every op, built once. Looking each consumer up with
    # `list.index` would make the whole pass quadratic in the op count.
    # NOTE(review): keyed by identity, assuming op objects compare by
    # identity (no custom __eq__) — confirm.
    position_of = {id(op): position for position, op in enumerate(block.operations)}

    def _position(op):
        try:
            return position_of[id(op)]
        except KeyError:
            raise ValueError(f"{op!r} is not in list") from None

    boundary_vars = set()
    for producer in block.operations[:op_idx + 1]:
        if producer.op_type.startswith("const"):
            continue
        for var in producer.outputs:
            if var.val is not None:  # only consider non const vars
                continue
            # Stop at the first consumer past the boundary.
            if any(_position(consumer) > op_idx for consumer in var.child_ops):
                boundary_vars.add(var)

    return list(boundary_vars)
skip connections) boundary_vars = _get_first_chunk_outputs(block, op_idx) # This op will not be included in this program. Its output var will be made into an input block.operations = list(block.operations) boundary_op = block.operations[op_idx] # Add all boundary ops as inputs with block: for var in boundary_vars: new_placeholder = Placeholder( sym_shape=var.shape, dtype=var.dtype if var.dtype != types.fp16 else types.fp32, name=var.name, ) block._input_dict[ new_placeholder.outputs[0].name] = new_placeholder.outputs[0] block.function_inputs = tuple(block._input_dict.values()) new_var = None if var.dtype == types.fp16: new_var = mb.cast(x=new_placeholder.outputs[0], dtype="fp16", before_op=var.op) else: new_var = new_placeholder.outputs[0] block.replace_uses_of_var_after_op( anchor_op=boundary_op, old_var=var, new_var=new_var, # This is needed if the program contains "constexpr_*" ops. In normal cases, there are stricter # rules for removing them, and their presence may prevent replacing this var. # However in this case, since we want to remove all the ops in chunk 1, we can safely # set this to True. 
force_replace=True, ) PASS_REGISTRY["common::dead_code_elimination"](prog) # Remove any unused inputs new_input_dict = OrderedDict() for k, v in block._input_dict.items(): if len(v.child_ops) > 0: new_input_dict[k] = v block._input_dict = new_input_dict block.function_inputs = tuple(block._input_dict.values()) return prog def _legacy_model_chunking(args): # TODO: Remove this method after setting the coremltools dependency >= 8.0 os.makedirs(args.o, exist_ok=True) # Check filename extension mlpackage_name = os.path.basename(args.mlpackage_path) name, ext = os.path.splitext(mlpackage_name) assert ext == ".mlpackage", f"`--mlpackage-path` (args.mlpackage_path) is not an .mlpackage file" # Load CoreML model logger.info("Loading model from {}".format(args.mlpackage_path)) start_ = time.time() model = ct.models.MLModel( args.mlpackage_path, compute_units=ct.ComputeUnit.CPU_ONLY, ) logger.info( f"Loading {args.mlpackage_path} took {time.time() - start_:.1f} seconds" ) # Load the MIL Program from MLModel prog = _load_prog_from_mlmodel(model) # Compute the incision point by bisecting the program based on weights size op_idx, first_chunk_weights_size, total_weights_size = _get_op_idx_split_location( prog) main_block = prog.functions["main"] incision_op = main_block.operations[op_idx] logger.info(f"{args.mlpackage_path} will chunked into two pieces.") logger.info( f"The incision op: name={incision_op.name}, type={incision_op.op_type}, index={op_idx}/{len(main_block.operations)}" ) logger.info(f"First chunk size = {first_chunk_weights_size:.2f} MB") logger.info( f"Second chunk size = {total_weights_size - first_chunk_weights_size:.2f} MB" ) # Build first chunk (in-place modifies prog by declaring early exits and removing unused subgraph) prog_chunk1 = _make_first_chunk_prog(prog, op_idx) # Build the second chunk prog_chunk2 = _make_second_chunk_prog(_load_prog_from_mlmodel(model), op_idx) if not args.check_output_correctness: # Original model no longer needed in memory del 
def _requires_legacy_chunking(version):
    """ Return True when `version` predates coremltools 8.0b2, the first
    release for which this script uses `bisect_model`.

    The version is compared numerically. Comparing version strings directly
    is lexicographic, so "10.0" sorts before "8.0" and "8.0b1" sorts after
    "8.0". Unparsable versions are routed to the legacy implementation, which
    is bundled with this script.
    """
    import re  # local import: only this helper needs it

    parsed = re.match(
        r"(\d+)(?:\.(\d+))?(?:\.(\d+))?(?:\.?(a|b|rc)(\d+))?", str(version))
    if parsed is None:
        return True

    major = int(parsed.group(1))
    minor = int(parsed.group(2) or 0)
    patch = int(parsed.group(3) or 0)

    if (major, minor) != (8, 0):
        return (major, minor) < (8, 0)

    # 8.0 final, or any 8.0.x patch release
    if patch > 0 or parsed.group(4) is None:
        return False

    # 8.0 pre-releases: alphas and 8.0b1 predate the bisect_model API
    stage, number = parsed.group(4), int(parsed.group(5))
    return stage == "a" or (stage == "b" and number < 2)


def main(args):
    """ Split the model at `args.mlpackage_path` into two chunks under `args.o`.

    The legacy in-script implementation is used for coremltools older than
    8.0b2; otherwise the work is delegated to
    `coremltools.models.utils.bisect_model`. When `args.remove_original` is
    set, the original (non-chunked) .mlpackage is deleted afterwards.

    Args:
        args: Parsed command line namespace. Reads `mlpackage_path`, `o`,
            `merge_chunks_in_pipeline_model`, `check_output_correctness` and
            `remove_original`.
    """
    ct_version = ct.__version__

    if _requires_legacy_chunking(ct_version):
        # With coremltools version <= 8.0b1,
        # we use the legacy implementation.
        # TODO: Remove the logic after setting the coremltools dependency >= 8.0.
        logger.info(
            f"coremltools version {ct_version} detected. Recommended upgrading the package version to "
            f"'8.0b2' when you running chunk_mlprogram.py script for the latest supports and bug fixes."
        )
        _legacy_model_chunking(args)
    else:
        # Starting from coremltools==8.0b2, there is this `bisect_model` API that
        # we can directly call into.
        from coremltools.models.utils import bisect_model
        logger.info(f"Start chunking model {args.mlpackage_path} into two pieces.")
        bisect_model(
            model=args.mlpackage_path,
            output_dir=args.o,
            merge_chunks_to_pipeline=args.merge_chunks_in_pipeline_model,
            check_output_correctness=args.check_output_correctness,
        )
        logger.info("Model chunking is done.")

    # Remove original (non-chunked) model if requested
    if args.remove_original:
        # The message previously lacked its f prefix and printed the
        # placeholder literally.
        logger.info(
            f"Removing original (non-chunked) model at {args.mlpackage_path}")
        shutil.rmtree(args.mlpackage_path)
        logger.info("Done.")
class ControlNetConditioningEmbedding(nn.Module):
    """ Convolutional stack that embeds the ControlNet conditioning input.

    Layout: an input 3x3 conv, then for every consecutive pair of entries in
    `block_out_channels` a channel-preserving 3x3 conv followed by a stride-2
    3x3 conv that moves to the next channel count, and finally an output 3x3
    conv producing `conditioning_embedding_channels` channels. SiLU is applied
    after every conv except the output one.
    """

    def __init__(
        self,
        conditioning_embedding_channels,
        conditioning_channels=3,
        block_out_channels=(16, 32, 96, 256),
    ):
        super().__init__()

        first_width = block_out_channels[0]
        last_width = block_out_channels[-1]

        self.conv_in = nn.Conv2d(conditioning_channels, first_width, kernel_size=3, padding=1)

        self.blocks = nn.ModuleList([])
        for width, next_width in zip(block_out_channels[:-1], block_out_channels[1:]):
            # Same-width conv, then a stride-2 conv to the next width
            self.blocks.append(nn.Conv2d(width, width, kernel_size=3, padding=1))
            self.blocks.append(nn.Conv2d(width, next_width, kernel_size=3, padding=1, stride=2))

        self.conv_out = nn.Conv2d(last_width, conditioning_embedding_channels, kernel_size=3, padding=1)

    def forward(self, conditioning):
        """ Return the embedding computed from `conditioning`. """
        hidden = F.silu(self.conv_in(conditioning))
        for layer in self.blocks:
            hidden = F.silu(layer(hidden))
        return self.conv_out(hidden)
only_cross_attention=False, block_out_channels=(320, 640, 1280, 1280), layers_per_block=2, downsample_padding=1, mid_block_scale_factor=1, act_fn="silu", norm_num_groups=32, norm_eps=1e-5, cross_attention_dim=1280, transformer_layers_per_block=1, attention_head_dim=8, use_linear_projection=False, upcast_attention=False, resnet_time_scale_shift="default", conditioning_embedding_out_channels=(16, 32, 96, 256), **kwargs, ): super().__init__() # Check inputs if len(block_out_channels) != len(down_block_types): raise ValueError( f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." ) if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): raise ValueError( f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." 
) self._register_load_state_dict_pre_hook(linear_to_conv2d_map) # input conv_in_kernel = 3 conv_in_padding = (conv_in_kernel - 1) // 2 self.conv_in = nn.Conv2d( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding ) # time time_embed_dim = block_out_channels[0] * 4 self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] self.time_embedding = TimestepEmbedding( timestep_input_dim, time_embed_dim, ) # control net conditioning embedding self.controlnet_cond_embedding = ControlNetConditioningEmbedding( conditioning_embedding_channels=block_out_channels[0], block_out_channels=conditioning_embedding_out_channels, ) self.down_blocks = nn.ModuleList([]) self.controlnet_down_blocks = nn.ModuleList([]) if isinstance(only_cross_attention, bool): only_cross_attention = [only_cross_attention] * len(down_block_types) if isinstance(attention_head_dim, int): attention_head_dim = (attention_head_dim,) * len(down_block_types) if isinstance(transformer_layers_per_block, int): transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) # down output_channel = block_out_channels[0] controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) self.controlnet_down_blocks.append(controlnet_block) for i, down_block_type in enumerate(down_block_types): input_channel = output_channel output_channel = block_out_channels[i] is_final_block = i == len(block_out_channels) - 1 down_block = get_down_block( down_block_type, transformer_layers_per_block=transformer_layers_per_block[i], num_layers=layers_per_block, in_channels=input_channel, out_channels=output_channel, temb_channels=time_embed_dim, resnet_eps=norm_eps, resnet_act_fn=act_fn, cross_attention_dim=cross_attention_dim, attn_num_head_channels=attention_head_dim[i], downsample_padding=downsample_padding, add_downsample=not is_final_block, ) self.down_blocks.append(down_block) for _ in 
range(layers_per_block): controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) self.controlnet_down_blocks.append(controlnet_block) if not is_final_block: controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) self.controlnet_down_blocks.append(controlnet_block) # mid mid_block_channel = block_out_channels[-1] controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1) self.controlnet_mid_block = controlnet_block self.mid_block = UNetMidBlock2DCrossAttn( in_channels=mid_block_channel, temb_channels=time_embed_dim, resnet_eps=norm_eps, resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift=resnet_time_scale_shift, cross_attention_dim=cross_attention_dim, attn_num_head_channels=attention_head_dim[-1], resnet_groups=norm_num_groups, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, ) def get_num_residuals(self): num_res = 2 # initial sample + mid block for down_block in self.down_blocks: num_res += len(down_block.resnets) if hasattr(down_block, "downsamplers") and down_block.downsamplers is not None: num_res += len(down_block.downsamplers) return num_res def forward( self, sample, timestep, encoder_hidden_states, controlnet_cond, ): # 1. time t_emb = self.time_proj(timestep) emb = self.time_embedding(t_emb) # 2. pre-process sample = self.conv_in(sample) controlnet_cond = self.controlnet_cond_embedding(controlnet_cond) sample += controlnet_cond # 3. down down_block_res_samples = (sample,) for downsample_block in self.down_blocks: if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None: sample, res_samples = downsample_block( hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states, ) else: sample, res_samples = downsample_block(hidden_states=sample, temb=emb) down_block_res_samples += res_samples # 4. 
mid if self.mid_block is not None: sample = self.mid_block( sample, emb, encoder_hidden_states=encoder_hidden_states, ) # 5. Control net blocks controlnet_down_block_res_samples = () for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): down_block_res_sample = controlnet_block(down_block_res_sample) controlnet_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = controlnet_down_block_res_samples mid_block_res_sample = self.controlnet_mid_block(sample) return down_block_res_samples, mid_block_res_sample ================================================ FILE: python_coreml_stable_diffusion/coreml_model.py ================================================ # # For licensing see accompanying LICENSE.md file. # Copyright (C) 2022 Apple Inc. All Rights Reserved. # import coremltools as ct import logging import json logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) import numpy as np import os import time import subprocess import sys def _macos_version(): """ Returns macOS version as a tuple of integers. On non-Macs, returns an empty tuple. 
""" if sys.platform == "darwin": try: ver_str = subprocess.run(["sw_vers", "-productVersion"], stdout=subprocess.PIPE).stdout.decode('utf-8').strip('\n') return tuple([int(v) for v in ver_str.split(".")]) except: raise Exception("Unable to determine the macOS version") return () class CoreMLModel: """ Wrapper for running CoreML models using coremltools """ def __init__(self, model_path, compute_unit, sources='packages', optimization_hints=None): logger.info(f"Loading {model_path}") start = time.time() if sources == 'packages': assert os.path.exists(model_path) and model_path.endswith(".mlpackage") self.model = ct.models.MLModel( model_path, compute_units=ct.ComputeUnit[compute_unit], optimization_hints=optimization_hints, ) DTYPE_MAP = { 65552: np.float16, 65568: np.float32, 131104: np.int32, } self.expected_inputs = { input_tensor.name: { "shape": tuple(input_tensor.type.multiArrayType.shape), "dtype": DTYPE_MAP[input_tensor.type.multiArrayType.dataType], } for input_tensor in self.model._spec.description.input } elif sources == 'compiled': assert os.path.exists(model_path) and model_path.endswith(".mlmodelc") self.model = ct.models.CompiledMLModel( model_path, compute_units=ct.ComputeUnit[compute_unit], optimization_hints=optimization_hints, ) # Grab expected inputs from metadata.json with open(os.path.join(model_path, 'metadata.json'), 'r') as f: config = json.load(f)[0] self.expected_inputs = { input_tensor['name']: { "shape": tuple(eval(input_tensor['shape'])), "dtype": np.dtype(input_tensor['dataType'].lower()), } for input_tensor in config['inputSchema'] } else: raise ValueError(f'Expected `packages` or `compiled` for sources, received {sources}') load_time = time.time() - start logger.info(f"Done. Took {load_time:.1f} seconds.") if load_time > LOAD_TIME_INFO_MSG_TRIGGER: logger.info( "Loading a CoreML model through coremltools triggers compilation every time. 
def get_resource_type(resources_dir: str) -> str:
    """
        Detect resource type based on filepath extensions.

        Only sub-directories of `resources_dir` whose names carry an extension are
        considered; plain files and extension-less directories are ignored.

        returns:
            `packages`: for .mlpackage resources
            `compiled`: for .mlmodelc resources

        raises:
            ValueError: if no candidate directory is found, if more than one distinct
                extension is present, or if the single extension found is neither
                .mlpackage nor .mlmodelc.
    """
    # Distinct, non-empty extensions of the sub-directories
    extensions = {
        os.path.splitext(entry)[1]
        for entry in os.listdir(resources_dir)
        if os.path.isdir(os.path.join(resources_dir, entry))
    }
    extensions.discard('')

    # Nothing to infer from: previously this case was misreported as
    # "Multiple file extensions found".
    if not extensions:
        raise ValueError(f'Did not find .mlpackage or .mlmodelc at {resources_dir}')

    if len(extensions) > 1:
        raise ValueError(f'Multiple file extensions found at {resources_dir}.'
                         f'Cannot infer resource type from contents.')

    extension = extensions.pop()
    if extension == '.mlpackage':
        sources = 'packages'
    elif extension == '.mlmodelc':
        sources = 'compiled'
    else:
        raise ValueError(f'Did not find .mlpackage or .mlmodelc at {resources_dir}')

    return sources
class LayerNormANE(nn.Module):
    """ LayerNorm optimized for Apple Neural Engine (ANE) execution

    Note: This layer only supports normalization over the final dim. It expects `num_channels`
    as an argument and not `normalized_shape` which is used by `torch.nn.LayerNorm`.
    """

    def __init__(self,
                 num_channels,
                 clip_mag=None,
                 eps=1e-5,
                 elementwise_affine=True):
        """
        Args:
            num_channels:       Number of channels (C) where the expected input data format is BC1S. S stands for sequence length.
            clip_mag:           Optional float value to use for clamping the input range before layer norm is applied.
                                If specified, helps reduce risk of overflow.
            eps:                Small value to avoid dividing by zero
            elementwise_affine: If true, adds learnable channel-wise shift (bias) and scale (weight) parameters
        """
        super().__init__()
        # Principle 1: Picking the Right Data Format (machinelearning.apple.com/research/apple-neural-engine)
        self.expected_rank = len("BC1S")

        self.num_channels = num_channels
        self.eps = eps
        self.clip_mag = clip_mag
        self.elementwise_affine = elementwise_affine

        if self.elementwise_affine:
            # Allocated uninitialised; actual values are set by _reset_parameters()
            self.weight = nn.Parameter(torch.empty(num_channels))
            self.bias = nn.Parameter(torch.empty(num_channels))

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the affine parameters to the identity transform (weight=1, bias=0)."""
        if self.elementwise_affine:
            nn.init.ones_(self.weight)
            nn.init.zeros_(self.bias)

    def forward(self, inputs):
        """Normalize `inputs` over the channel dimension (dim 1 of the BC1S layout).

        A rank-3 input whose last dimension equals `num_channels` is treated as BSC
        and converted to BC1S first. The caller's tensor is never modified.

        Returns:
            Tensor in BC1S layout.
        """
        input_rank = len(inputs.size())

        # Principle 1: Picking the Right Data Format (machinelearning.apple.com/research/apple-neural-engine)
        # Migrate the data format from BSC to BC1S (most conducive to ANE)
        if input_rank == 3 and inputs.size(2) == self.num_channels:
            inputs = inputs.transpose(1, 2).unsqueeze(2)
            input_rank = len(inputs.size())

        assert input_rank == self.expected_rank
        assert inputs.size(1) == self.num_channels

        if self.clip_mag is not None:
            # Out-of-place clamp: the in-place variant wrote through to the caller's
            # tensor (the BC1S tensor above is only a view of the BSC input).
            inputs = inputs.clamp(-self.clip_mag, self.clip_mag)

        channels_mean = inputs.mean(dim=1, keepdims=True)

        zero_mean = inputs - channels_mean

        zero_mean_sq = zero_mean * zero_mean

        denom = (zero_mean_sq.mean(dim=1, keepdims=True) + self.eps).rsqrt()

        out = zero_mean * denom

        if self.elementwise_affine:
            # NOTE(review): the bias is added BEFORE scaling by weight, unlike
            # torch.nn.LayerNorm (scale, then shift). Presumably loaded checkpoints
            # are adjusted for this ordering -- confirm against the state-dict loading code.
            out = (out + self.bias.view(1, self.num_channels, 1, 1)
                   ) * self.weight.view(1, self.num_channels, 1, 1)

        return out
def main(args):
    """Palettize a Core ML model according to a recipe from the pre-analysis JSON.

    Reads `args.mlpackage_path`, `args.pre_analysis_json_path`, `args.selected_recipe`
    and `args.o`; sets `args.model_version` from the JSON before building the torch
    pipeline via `get_pipeline(args)`. The palettized model is saved to `args.o`.

    Torch parameters named in the recipe are matched to Core ML weights through a
    cheap hash (first element + number of elements) because the tensor names differ.

    Raises:
        KeyError: if the selected recipe is not in the JSON, or a recipe entry has no
            matching `<name>.weight` parameter in the torch UNet.
        AssertionError: if the recipe contains an unsupported bit-width or a Core ML
            weight cannot be matched to any recipe entry.
    """
    # Load Core ML model
    coreml_model = ct.models.MLModel(args.mlpackage_path, compute_units=ct.ComputeUnit.CPU_ONLY)
    logger.info(f"Loaded {args.mlpackage_path}")

    # Load palettization recipe
    with open(args.pre_analysis_json_path, 'r') as f:
        pre_analysis = json.load(f)

    if args.selected_recipe not in list(pre_analysis["recipes"]):
        raise KeyError(
            f"--selected-recipe ({args.selected_recipe}) not found in "
            f"--pre-analysis-json-path ({args.pre_analysis_json_path}). "
            f" Available recipes: {list(pre_analysis['recipes'])}"
        )

    recipe = pre_analysis["recipes"][args.selected_recipe]
    allowed_nbits = NBITS + [16]
    assert all(nbits in allowed_nbits for nbits in recipe.values()), \
        f"Some nbits values in the recipe are illegal. Allowed values: {allowed_nbits}"

    # Hash tensors to be able to match torch tensor names to mil tensors
    def get_tensor_hash(tensor):
        assert tensor.dtype == np.float16
        return tensor.ravel()[0] + np.prod(tensor.shape)

    args.model_version = pre_analysis["model_version"]
    pipe = get_pipeline(args)
    torch_model = pipe.unet

    # Index parameters once instead of rescanning named_parameters() per recipe entry
    params = dict(torch_model.named_parameters())

    hashed_recipe = {}
    for torch_module_name, nbits in recipe.items():
        param_name = torch_module_name + '.weight'
        if param_name not in params:
            raise KeyError(
                f"Recipe entry '{torch_module_name}' has no matching parameter "
                f"'{param_name}' in the torch model"
            )
        tensor = params[param_name].detach().cpu().numpy().astype(np.float16)
        hashed_recipe[get_tensor_hash(tensor)] = nbits

    # Drop every reference to the torch model; deleting `pipe` alone frees nothing
    # while `torch_model`/`params` still point at the UNet.
    del params, torch_model, pipe
    gc.collect()

    op_name_configs = {}
    weight_metadata = cto.get_weights_metadata(coreml_model, weight_threshold=MIN_SIZE)
    hashes = np.array(list(hashed_recipe))
    for name, metadata in weight_metadata.items():
        # Look up target bits for this weight
        tensor_hash = get_tensor_hash(metadata.val)
        pdist = np.abs(hashes - tensor_hash)
        assert pdist.min() < 0.01, f"No recipe entry matches Core ML weight '{name}'"
        matched = pdist.argmin()
        target_nbits = hashed_recipe[hashes[matched]]

        # 16-bit entries are left uncompressed
        if target_nbits == 16:
            continue

        op_name_configs[name] = cto.OpPalettizerConfig(
            mode="kmeans",
            nbits=target_nbits,
            weight_threshold=int(MIN_SIZE)
        )

    config = ct.optimize.coreml.OptimizationConfig(op_name_configs=op_name_configs)
    coreml_model = ct.optimize.coreml.palettize_weights(coreml_model, config)

    coreml_model.save(args.o)
)) args = parser.parse_args() if not os.path.exists(args.mlpackage_path): raise FileNotFoundError if not os.path.exists(args.pre_analysis_json_path): raise FileNotFoundError if not args.pre_analysis_json_path.endswith('.json'): raise ValueError("--recipe-json-path should end with '.json'") main(args) ================================================ FILE: python_coreml_stable_diffusion/mixed_bit_compression_pre_analysis.py ================================================ from collections import OrderedDict from copy import deepcopy from functools import partial import argparse import gc import json import logging logging.basicConfig() logger = logging.getLogger() logger.setLevel('INFO') import numpy as np import os from PIL import Image from python_coreml_stable_diffusion.torch2coreml import compute_psnr, get_pipeline import time import torch import torch.nn as nn import requests torch.set_grad_enabled(False) from tqdm import tqdm # Bit-widths the Neural Engine is capable of accelerating NBITS = [1, 2, 4, 6, 8] # Minimum number of elements in a weight tensor to be considered for palettization # (saves pre-analysis time) PALETTIZE_MIN_SIZE = 1e5 # Signal integrity is computed based on these 4 random prompts RANDOM_TEST_DATA = [ "a black and brown dog standing outside a door.", "a person on a motorcycle makes a turn on the track.", "inflatable boats sit on the arizona river, and on the bank", "a white cat sitting under a white umbrella", "black bear standing in a field of grass under a tree.", "a train that is parked on tracks and has graffiti writing on it, with a mountain range in the background.", "a cake inside of a pan sitting in an oven.", "a table with paper plates and flowers in a home", ] TEST_RESOLUTION = 768 RANDOM_TEST_IMAGE_DATA = [ Image.open( requests.get(path, stream=True).raw).convert("RGB").resize( (TEST_RESOLUTION, TEST_RESOLUTION), Image.LANCZOS ) for path in [ "http://farm1.staticflickr.com/106/298138827_19bb723252_z.jpg", 
"http://farm4.staticflickr.com/3772/9666116202_648cd752d6_z.jpg", "http://farm3.staticflickr.com/2238/2472574092_f5534bb2f7_z.jpg", "http://farm1.staticflickr.com/220/475442674_47d81fdc2c_z.jpg", "http://farm8.staticflickr.com/7231/7359341784_4c5358197f_z.jpg", "http://farm8.staticflickr.com/7283/8737653089_d0c77b8597_z.jpg", "http://farm3.staticflickr.com/2454/3989339438_2f32b76ebb_z.jpg", "http://farm1.staticflickr.com/34/123005230_13051344b1_z.jpg", ]] # Copied from https://github.com/apple/coremltools/blob/7.0b1/coremltools/optimize/coreml/_quantization_passes.py#L602 from coremltools.converters.mil.mil import types def fake_linear_quantize(val, axis=-1, mode='LINEAR', dtype=types.int8): from coremltools.optimize.coreml._quantization_passes import AffineQuantParams from coremltools.converters.mil.mil.types.type_mapping import nptype_from_builtin val_dtype = val.dtype def _ensure_numerical_range_and_cast(val, low, high, np_dtype): ''' For some cases, the computed quantized data might exceed the data range. For instance, after rounding and addition, we might get `128` for the int8 quantization. This utility function ensures the val in the data range before doing the cast. 
''' val = np.minimum(val, high) val = np.maximum(val, low) return val.astype(np_dtype) mode_dtype_to_range = { (types.int8, "LINEAR"): (-128, 127), (types.int8, "LINEAR_SYMMETRIC"): (-127, 127), (types.uint8, "LINEAR"): (0, 255), (types.uint8, "LINEAR_SYMMETRIC"): (0, 254), } if not isinstance(val, (np.ndarray, np.generic)): raise ValueError("Only numpy arrays are supported") params = AffineQuantParams() axes = tuple([i for i in range(len(val.shape)) if i != axis]) val_min = np.amin(val, axis=axes, keepdims=True) val_max = np.amax(val, axis=axes, keepdims=True) if mode == "LINEAR_SYMMETRIC": # For the linear_symmetric mode, the range is symmetrical to 0 max_abs = np.maximum(np.abs(val_min), np.abs(val_max)) val_min = -max_abs val_max = max_abs else: assert mode == "LINEAR" # For the linear mode, we need to make sure the data range contains `0` val_min = np.minimum(0.0, val_min) val_max = np.maximum(0.0, val_max) q_val_min, q_val_max = mode_dtype_to_range[(dtype, mode)] # Set the zero point to symmetric mode np_dtype = nptype_from_builtin(dtype) if mode == "LINEAR_SYMMETRIC": if dtype == types.int8: params.zero_point = (0 * np.ones(val_min.shape)).astype(np.int8) else: assert dtype == types.uint8 params.zero_point = (127 * np.ones(val_min.shape)).astype(np.uint8) else: assert mode == "LINEAR" params.zero_point = (q_val_min * val_max - q_val_max * val_min) / (val_max - val_min) params.zero_point = np.round(params.zero_point) params.zero_point = _ensure_numerical_range_and_cast(params.zero_point, q_val_min, q_val_max, np_dtype) # compute the params params.scale = (val_max - val_min) / (q_val_max - q_val_min) params.scale = params.scale.astype(val.dtype).squeeze() params.quantized_data = np.round( val * (q_val_max - q_val_min) / (val_max - val_min) ) params.quantized_data = (params.quantized_data + params.zero_point) params.quantized_data = _ensure_numerical_range_and_cast(params.quantized_data, q_val_min, q_val_max, np_dtype) params.zero_point = 
def fake_palettize(module, nbits, in_ngroups=1, out_ngroups=1):
    """ Simulate weight palettization

    Replaces `module.weight.data` with its k-means look-up-table reconstruction,
    computed on a float16 copy of the weight. The new weight keeps the original
    dtype and device.

    Args:
        module:      Object with a `weight` tensor parameter (modified in place).
        nbits:       Bit-width passed to the k-means look-up-table routine.
        in_ngroups:  Number of independently palettized groups along dim 1.
        out_ngroups: Number of independently palettized groups along dim 0.
                     At most one of `in_ngroups` / `out_ngroups` may exceed 1.

    Returns:
        The pre-palettization weight as a CPU tensor of the original dtype, for use
        with `restore_weight`. Note that it has been round-tripped through float16.

    Raises:
        ValueError: if both `in_ngroups` and `out_ngroups` are greater than 1.
    """
    from coremltools.models.neural_network.quantization_utils import _get_kmeans_lookup_table_and_weight

    def compress_kmeans(val, nbits):
        lut, indices = _get_kmeans_lookup_table_and_weight(nbits, val)
        lut = lut.astype(val.dtype)
        indices = indices.astype(np.uint8)
        return lut, indices

    dtype = module.weight.data.dtype
    device = module.weight.data.device
    val = module.weight.data.cpu().numpy().astype(np.float16)

    if out_ngroups == 1 and in_ngroups == 1:
        lut, indices = compress_kmeans(val=val, nbits=nbits)
        # Move back to the original device, as the grouped branches below do
        module.weight.data = torch.from_numpy(lut[indices]).reshape(val.shape).to(dtype).to(device)

    elif out_ngroups > 1 and in_ngroups == 1:
        assert val.shape[0] % out_ngroups == 0
        rvals = [
            compress_kmeans(val=chunked_val, nbits=nbits)
            for chunked_val in np.split(val, out_ngroups, axis=0)
        ]
        shape = list(val.shape)
        shape[0] = shape[0] // out_ngroups
        module.weight.data = torch.cat([
            torch.from_numpy(lut[indices]).reshape(shape)
            for lut,indices in rvals
        ], dim=0).to(dtype).to(device)

    elif in_ngroups > 1 and out_ngroups == 1:
        assert val.shape[1] % in_ngroups == 0
        rvals = [
            compress_kmeans(val=chunked_val, nbits=nbits)
            for chunked_val in np.split(val, in_ngroups, axis=1)
        ]
        shape = list(val.shape)
        shape[1] = shape[1] // in_ngroups
        module.weight.data = torch.cat([
            torch.from_numpy(lut[indices]).reshape(shape)
            for lut,indices in rvals
        ], dim=1).to(dtype).to(device)
    else:
        raise ValueError(f"in_ngroups={in_ngroups} & out_ngroups={out_ngroups} is illegal!!!")

    return torch.from_numpy(val).to(dtype)
def fake_int8_quantize(module):
    """Simulate int8 linear quantization of every weight found in `module`.

    Each submodule that has a non-None `weight` gets its weight data replaced by the
    output of `fake_linear_quantize`, kept on the weight's original device.
    Submodules without a weight (or whose `weight` is None) are skipped.

    Args:
        module: Root `torch.nn.Module`; modified in place.

    Returns:
        The same `module` object.
    """
    num_quantized = 0
    for _, submodule in tqdm(module.named_modules()):
        weight = getattr(submodule, 'weight', None)
        # Some modules expose `weight` but leave it as None
        if weight is None:
            continue

        device = weight.data.device
        # NumPy conversion needs a CPU tensor; move the result back afterwards
        quantized = fake_linear_quantize(weight.data.cpu().numpy())
        submodule.weight.data = torch.from_numpy(quantized).to(device)
        num_quantized += 1

    logger.info(f"{num_quantized} modules fake int8 quantized")
    return module
def descending_psnr_order(results):
    """Order benchmark results from highest to lowest total PSNR.

    Args:
        results: Mapping from candidate name to a list of PSNR values. An optional
            'metadata' entry is ignored. The mapping itself is left untouched.

    Returns:
        OrderedDict of the candidate entries (without 'metadata'), sorted by
        decreasing sum of their PSNR values; ties keep their original order.
    """
    # Filter into a new dict rather than popping, so the caller's results (which
    # are later serialized) keep their metadata.
    scored = {name: psnrs for name, psnrs in results.items() if name != 'metadata'}
    return OrderedDict(sorted(scored.items(), key=lambda item: -sum(item[1])))
def plot(results, args):
    """Plot signal integrity (PSNR) against model size reduction and save it as a PNG.

    Draws the linear 8-bit baseline (marker and horizontal iso-dB line), one
    cumulative palettization curve per entry in `NBITS`, and the mixed-bit recipe
    results found in `results['baselines']`.

    Args:
        results: Dict with 'cumulative' (per-nbits results incl. 'metadata' -> 'sizes')
            and 'baselines' (incl. 'linear_8bit' and `recipe_*` entries).
        args: Namespace providing `default_nbits`, `model_version` and output dir `o`.
    """
    import matplotlib.pyplot as plt
    f, ax = plt.subplots(1, 1, figsize=(7, 5))

    def compute_x_axis(sizes, nbits, default_nbits):
        # Size reduction (%) reached after palettizing each successive layer
        max_compression_percent = (default_nbits - nbits) / default_nbits
        progress = np.cumsum(sizes)
        normalized_progress = progress / progress.max()

        return normalized_progress * max_compression_percent * 100

    # Linear 8-bit baseline and the intercept points for mixed-bit recipes
    linear8bit_baseline = results['baselines']['linear_8bit']

    # Mark the linear 8-bit baseline
    # NOTE(review): 8 / default_nbits equals the size *reduction* only when
    # default_nbits == 16; confirm the intended x position for other defaults.
    ax.plot(
        8 / args.default_nbits * 100,
        linear8bit_baseline,
        'bx',
        markersize=8,
        label="8-bit (linear quant)")

    # Plot the iso-dB line that matches the 8-bit baseline
    ax.plot([0,100], [linear8bit_baseline]*2, '--b')

    # Plot non-mixed-bit palettization curves
    for nbits in NBITS:
        cumulative = results['cumulative'][str(nbits)]
        size_keys = compute_x_axis(cumulative['metadata']['sizes'], nbits, args.default_nbits)
        psnr = [
            sum(v) / len(RANDOM_TEST_DATA) # avg psnr
            for k,v in cumulative.items() if k != 'metadata'
        ]
        ax.plot(
            size_keys,
            psnr,
            label=f"{nbits}-bit")

    # Plot mixed-bit results
    # Recipe keys look like "recipe_<avg nbits>_bit_mixedpalette"
    mixed_palettes = [
        (float(spec.rsplit('_')[1]), psnr)
        for spec,psnr in results['baselines'].items()
        if 'recipe' in spec
    ]
    mixedbit_sizes = [100. * (1. - a[0] / args.default_nbits) for a in mixed_palettes]
    mixedbit_psnrs = [a[1] for a in mixed_palettes]
    ax.plot(
        mixedbit_sizes,
        mixedbit_psnrs,
        label="mixed-bit",
    )

    ax.set_xlabel("Model Size Reduction (%)")
    ax.set_ylabel("Signal Integrity (PSNR in dB)")
    ax.set_title(args.model_version)
    ax.legend()

    f.savefig(os.path.join(args.o, f"{args.model_version.replace('/','_')}_psnr_vs_size.png"))
    # Release the figure once written; pyplot otherwise keeps it alive
    plt.close(f)
results['cumulative'][str(nbits)]['metadata'].update({ 'candidates': list(sorted_candidates.keys()), 'sizes': [sizes_table[candidate] for candidate in sorted_candidates], }) with open(os.path.join(args.o, json_name), 'w') as f: json.dump(results, f, indent=2) # Generate uniform-quantization baselines results['baselines'] = { "original": simulate_quant_fn(ref_pipe, lambda x: x)[1], "linear_8bit": simulate_quant_fn(ref_pipe, fake_int8_quantize)[1], } with open(os.path.join(args.o, json_name), 'w') as f: json.dump(results, f, indent=2) # Generate mixed-bit recipes via decreasing PSNR thresholds results['recipes'] = {} recipe_psnr_thresholds = np.linspace( results['baselines']['original'] - 1, results['baselines']["linear_8bit"] + 5, args.num_recipes, ) for recipe_no, psnr_threshold in enumerate(recipe_psnr_thresholds): logger.info(f"Building recipe #{recipe_no}") recipe, stats = build_recipe( results['cumulative'], sizes_table, psnr_threshold, args.default_nbits, ) achieved_psnr = simulate_quant_fn(ref_pipe, lambda x: partial(fake_palette_from_recipe, recipe=recipe)(x))[1] logger.info( f"Recipe #{recipe_no}: {stats['nbits']:.2f}-bits @ per-layer {psnr_threshold} dB, " f"end-to-end {achieved_psnr} dB & " f"{stats['size_mb']:.2f} MB" ) # Save achieved PSNR and compressed size recipe_key = f"recipe_{stats['nbits']:.2f}_bit_mixedpalette" results['baselines'][recipe_key] = float(f"{achieved_psnr:.1f}") results['recipes'][recipe_key] = recipe with open(os.path.join(args.o, json_name), 'w') as f: json.dump(results, f, indent=2) # Plot model size vs signal integrity plot(results, args) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-o", required=True, help="Output directory to save the palettization artifacts (recipe json, PSNR plots etc.)" ) parser.add_argument( "--model-version", required=True, help= ("The pre-trained model checkpoint and configuration to restore. 
" "For available versions: https://huggingface.co/models?search=stable-diffusion" )) parser.add_argument( "--default-nbits", help="Default number of bits to use for palettization", choices=tuple(NBITS + [16]), default=16, type=int, ) parser.add_argument( "--num-recipes", help="Maximum number of recipes to generate (with decreasing model size and signal integrity)", default=7, type=int, ) parser.add_argument( "--custom-vae-version", type=str, default=None, help= ("Custom VAE checkpoint to override the pipeline's built-in VAE. " "If specified, the specified VAE will be converted instead of the one associated to the `--model-version` checkpoint. " "No precision override is applied when using a custom VAE." )) args = parser.parse_args() main(args) ================================================ FILE: python_coreml_stable_diffusion/multilingual_projection.py ================================================ from python_coreml_stable_diffusion.torch2coreml import _compile_coreml_model import argparse import coremltools as ct import numpy as np import os import torch import torch.nn as nn # TODO: Read these values off of the NLContextualEmbedding API to enforce dimensions and track API versioning MAX_SEQUENCE_LENGTH = 256 EMBED_DIM = 512 BATCH_SIZE = 1 def main(args): # Layer that was trained to map NLContextualEmbedding to your text_encoder.hidden_size dimensionality text_encoder_projection = torch.jit.load(args.input_path) # Prepare random inputs for tracing the network before conversion random_input = torch.randn(BATCH_SIZE, MAX_SEQUENCE_LENGTH, EMBED_DIM) # Create a class to bake in the reshape operations required to fit the existing model interface class TextEncoderProjection(nn.Module): def __init__(self, proj): super().__init__() self.proj = proj def forward(self, x): return self.proj(x).transpose(1, 2).unsqueeze(2) # BSC, BC1S # Trace the torch model text_encoder_projection = torch.jit.trace(TextEncoderProjection(text_encoder_projection), (random_input,)) # 
Convert the model to Core ML mlpackage_path = os.path.join(args.output_dir, "MultilingualTextEncoderProjection.mlpackage") ct.convert( text_encoder_projection, inputs=[ct.TensorType('nlcontextualembeddings_output', shape=(1, MAX_SEQUENCE_LENGTH, EMBED_DIM), dtype=np.float32)], outputs=[ct.TensorType('encoder_hidden_states', dtype=np.float32)], minimum_deployment_target=ct.target.macOS14, # NLContextualEmbedding minimum availability build convert_to='mlprogram', ).save() # Compile the model and save it under the specified directory _compile_coreml_model(mlpackage_path, args.output_dir, final_name="MultilingualTextEncoderProjection") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--input-path", help="Path to the torchscript file that contains the projection layer" ) parser.add_argument( "--output-dir", help="Output directory in which the Core ML model should be saved", ) args = parser.parse_args() main(args) ================================================ FILE: python_coreml_stable_diffusion/pipeline.py ================================================ # # For licensing see accompanying LICENSE.md file. # Copyright (C) 2022 Apple Inc. All Rights Reserved. 
# import argparse from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.schedulers import ( DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, ) from diffusers.schedulers.scheduling_utils import SchedulerMixin import gc import inspect import logging logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) import numpy as np import os from python_coreml_stable_diffusion.coreml_model import ( CoreMLModel, _load_mlpackage, _load_mlpackage_controlnet, get_available_compute_units, ) import time import torch # Only used for `torch.from_tensor` in `pipe.scheduler.step()` from transformers import CLIPFeatureExtractor, CLIPTokenizer from typing import List, Optional, Union, Tuple from PIL import Image class CoreMLStableDiffusionPipeline(DiffusionPipeline): """ Core ML version of `diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline` """ def __init__( self, text_encoder: CoreMLModel, unet: CoreMLModel, vae_decoder: CoreMLModel, scheduler: Union[ DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler ], tokenizer: CLIPTokenizer, controlnet: Optional[List[CoreMLModel]], xl: Optional[bool] = False, force_zeros_for_empty_prompt: Optional[bool] = True, feature_extractor: Optional[CLIPFeatureExtractor] = None, safety_checker: Optional[CoreMLModel] = None, text_encoder_2: Optional[CoreMLModel] = None, tokenizer_2: Optional[CLIPTokenizer] = None ): super().__init__() # Register non-Core ML components of the pipeline similar to the original pipeline self.register_modules( tokenizer=tokenizer, scheduler=scheduler, feature_extractor=feature_extractor, ) if safety_checker is None: 
# Reproduce original warning: # https://github.com/huggingface/diffusers/blob/v0.9.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L119 logger.warning( f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" " results in services or applications open to the public. Both the diffusers team and Hugging Face" " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" " it only for use-cases that involve analyzing network behavior or auditing its results. For more" " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." ) self.xl = xl self.force_zeros_for_empty_prompt = force_zeros_for_empty_prompt # Register Core ML components of the pipeline self.safety_checker = safety_checker self.text_encoder = text_encoder self.text_encoder_2 = text_encoder_2 self.tokenizer_2 = tokenizer_2 self.unet = unet self.unet.in_channels = self.unet.expected_inputs["sample"]["shape"][1] self.controlnet = controlnet self.vae_decoder = vae_decoder VAE_DECODER_UPSAMPLE_FACTOR = 8 # In PyTorch, users can determine the tensor shapes dynamically by default # In CoreML, tensors have static shapes unless flexible shapes were used during export # See https://coremltools.readme.io/docs/flexible-inputs latent_h, latent_w = self.unet.expected_inputs["sample"]["shape"][2:] self.height = latent_h * VAE_DECODER_UPSAMPLE_FACTOR self.width = latent_w * VAE_DECODER_UPSAMPLE_FACTOR logger.info( f"Stable Diffusion configured to generate {self.height}x{self.width} images" ) def _encode_prompt(self, prompt, prompt_2: Optional[str] = None, do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, ): batch_size = len(prompt) if isinstance(prompt, list) else 1 if self.xl is True: prompts = [prompt, prompt_2] if 
prompt_2 is not None else [prompt, prompt] # refiner uses only one tokenizer and text encoder (tokenizer_2 and text_encoder_2) tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] text_encoders = [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [ self.text_encoder_2] hidden_state_key = 'hidden_embeds' else: prompts = [prompt] tokenizers = [self.tokenizer] text_encoders = [self.text_encoder] hidden_state_key = 'last_hidden_state' prompt_embeds_list = [] for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): text_inputs = tokenizer( prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="np", ) text_input_ids = text_inputs.input_ids # tokenize without max_length to catch any truncation untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.equal( text_input_ids, untruncated_ids ): removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1: -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {tokenizer.model_max_length} tokens: {removed_text}" ) embeddings = text_encoder(input_ids=text_input_ids.astype(np.float32)) prompt_embeds_list.append(embeddings[hidden_state_key]) # We are only ALWAYS interested in the pooled output of the final text encoder if self.xl: pooled_prompt_embeds = embeddings['pooled_outputs'] prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) if do_classifier_free_guidance and negative_prompt is None and self.force_zeros_for_empty_prompt: negative_prompt_embeds = np.zeros_like(prompt_embeds) if self.xl: negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) elif do_classifier_free_guidance: negative_prompt = negative_prompt or "" negative_prompt_2 = negative_prompt_2 or negative_prompt # 
normalize str to list negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt negative_prompt_2 = ( batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 ) uncond_tokens: List[str] if prompts is not None and type(prompts) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" f" {type(prompt)}." ) elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" " the batch size of `prompt`.") else: uncond_tokens = [negative_prompt, negative_prompt_2] negative_prompt_embeds_list = [] for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): max_length = prompt_embeds.shape[1] uncond_input = tokenizer( negative_prompt, padding="max_length", max_length=max_length, truncation=True, return_tensors="np", ) uncond_input_ids = uncond_input.input_ids negative_embeddings = text_encoder( input_ids=uncond_input_ids.astype(np.float32) ) negative_text_embeddings = negative_embeddings[hidden_state_key] negative_prompt_embeds_list.append(negative_text_embeddings) # We are only ALWAYS interested in the pooled output of the final text encoder if self.xl: negative_pooled_prompt_embeds = negative_embeddings['pooled_outputs'] negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) if do_classifier_free_guidance: # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes prompt_embeds = np.concatenate( [negative_prompt_embeds, prompt_embeds]) if self.xl: pooled_prompt_embeds = np.concatenate( [negative_pooled_prompt_embeds, pooled_prompt_embeds]) prompt_embeddings = prompt_embeds.transpose(0, 2, 1)[:, :, None, :] if self.xl: return prompt_embeddings, pooled_prompt_embeds else: return prompt_embeddings, None def run_controlnet(self, sample, timestep, encoder_hidden_states, controlnet_cond, output_dtype=np.float16): if not self.controlnet: raise ValueError( "Conditions for controlnet are given but the pipeline has no controlnet modules") for i, (module, cond) in enumerate(zip(self.controlnet, controlnet_cond)): module_outputs = module( sample=sample.astype(np.float16), timestep=timestep.astype(np.float16), encoder_hidden_states=encoder_hidden_states.astype(np.float16), controlnet_cond=cond.astype(np.float16), ) if i == 0: outputs = module_outputs else: for key in outputs.keys(): outputs[key] += module_outputs[key] outputs = {k: v.astype(output_dtype) for k, v in outputs.items()} return outputs def run_safety_checker(self, image): if self.safety_checker is not None: safety_checker_input = self.feature_extractor( self.numpy_to_pil(image), return_tensors="np", ) safety_checker_outputs = self.safety_checker( clip_input=safety_checker_input.pixel_values.astype( np.float16), images=image.astype(np.float16), adjustment=np.array([0.]).astype( np.float16), # defaults to 0 in original pipeline ) # Unpack dict has_nsfw_concept = safety_checker_outputs["has_nsfw_concepts"] image = safety_checker_outputs["filtered_images"] concept_scores = safety_checker_outputs["concept_scores"] logger.info( f"Generated image has nsfw concept={has_nsfw_concept.any()}") else: has_nsfw_concept = None return image, has_nsfw_concept def decode_latents(self, latents): latents = 1 / 0.18215 * latents dtype = self.vae_decoder.expected_inputs['z']['dtype'] 
image = self.vae_decoder(z=latents.astype(dtype))["image"] image = np.clip(image / 2 + 0.5, 0, 1) image = image.transpose((0, 2, 3, 1)) return image def prepare_latents(self, batch_size, num_channels_latents, height, width, latents=None): latents_shape = (batch_size, num_channels_latents, self.height // 8, self.width // 8) if latents is None: latents = np.random.randn(*latents_shape).astype(np.float16) elif latents.shape != latents_shape: raise ValueError( f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" ) init_noise = self.scheduler.init_noise_sigma if isinstance(init_noise, torch.Tensor): init_noise = init_noise.numpy() latents = latents * init_noise return latents def prepare_control_cond(self, controlnet_cond, do_classifier_free_guidance, batch_size, num_images_per_prompt): processed_cond_list = [] for cond in controlnet_cond: cond = np.stack([cond] * batch_size * num_images_per_prompt) if do_classifier_free_guidance: cond = np.concatenate([cond] * 2) processed_cond_list.append(cond) return processed_cond_list def check_inputs(self, prompt, height, width, callback_steps): if height != self.height or width != self.width: logger.warning( "`height` and `width` dimensions (of the output image tensor) are fixed when exporting the Core ML models " \ "unless flexible shapes are used during export (https://coremltools.readme.io/docs/flexible-inputs). " \ "This pipeline was provided with Core ML models that generate {self.height}x{self.width} images (user requested {height}x{width})" ) if not isinstance(prompt, str) and not isinstance(prompt, list): raise ValueError( f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" ) if height % 8 != 0 or width % 8 != 0: raise ValueError( f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
) if (callback_steps is None) or (callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}.") def prepare_extra_step_kwargs(self, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] accepts_eta = "eta" in set( inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta return extra_step_kwargs def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype): add_time_ids = list(original_size + crops_coords_top_left + target_size) add_time_ids = np.array(add_time_ids).astype(dtype) return add_time_ids def __call__( self, prompt, height=512, width=512, num_inference_steps=50, guidance_scale=7.5, negative_prompt=None, num_images_per_prompt=1, eta=0.0, latents=None, output_type="pil", return_dict=True, callback=None, callback_steps=1, controlnet_cond=None, original_size: Optional[Tuple[int, int]]=None, crops_coords_top_left: Tuple[int, int]=(0, 0), target_size: Optional[Tuple[int, int]]=None, unet_batch_one=False, **kwargs, ): # 1. Check inputs. Raise error if not correct self.check_inputs(prompt, height, width, callback_steps) height = height or self.height width = width or self.width original_size = original_size or (height, width) target_size = target_size or (height, width) # 2. Define call parameters batch_size = 1 if isinstance(prompt, str) else len(prompt) if batch_size > 1 or num_images_per_prompt > 1: raise NotImplementedError( "For batched generation of multiple images and/or multiple prompts, please refer to the Swift package." 
) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt text_embeddings, pooled_prompt_embeds = self._encode_prompt( prompt=prompt, prompt_2=None, do_classifier_free_guidance=do_classifier_free_guidance, negative_prompt=negative_prompt, negative_prompt_2=None ) # 4. Prepare XL kwargs if needed unet_additional_kwargs = {} # we add pooled prompt embeds + time_ids to unet kwargs if self.xl: add_text_embeds = pooled_prompt_embeds add_time_ids = self._get_add_time_ids(original_size, crops_coords_top_left, target_size, text_embeddings.dtype) if do_classifier_free_guidance: # TODO: This checks if the time_ids input is looking for time_ids.shape == (12,) or (2, 6) # Remove once model input shapes are ubiquitous if len(self.unet.expected_inputs['time_ids']['shape']) > 1: add_time_ids = [add_time_ids] add_time_ids = np.concatenate([add_time_ids, add_time_ids]) unet_additional_kwargs.update({'text_embeds': add_text_embeds.astype(np.float16), 'time_ids': add_time_ids.astype(np.float16)}) # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps # 6. Prepare latent variables and controlnet cond num_channels_latents = self.unet.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, height, width, latents, ) if controlnet_cond: controlnet_cond = self.prepare_control_cond( controlnet_cond, do_classifier_free_guidance, batch_size, num_images_per_prompt, ) # 7. Prepare extra step kwargs extra_step_kwargs = self.prepare_extra_step_kwargs(eta) # 8. 
Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance latent_model_input = np.concatenate( [latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input( latent_model_input, t) if isinstance(latent_model_input, torch.Tensor): latent_model_input = latent_model_input.numpy() if do_classifier_free_guidance: timestep = np.array([t, t], np.float16) else: timestep = np.array([t,], np.float16) # controlnet if controlnet_cond: control_net_additional_residuals = self.run_controlnet( sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings, controlnet_cond=controlnet_cond, ) else: control_net_additional_residuals = {} # predict the noise residual unet_additional_kwargs.update(control_net_additional_residuals) # get prediction from unet if not (unet_batch_one and do_classifier_free_guidance): noise_pred = self.unet( sample=latent_model_input.astype(np.float16), timestep=timestep, encoder_hidden_states=text_embeddings.astype(np.float16), **unet_additional_kwargs, )["noise_pred"] if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) else: # query unet sequentially latent_model_input = latent_model_input.astype(np.float16) text_embeddings = text_embeddings.astype(np.float16) timestep = np.array([t,], np.float16) noise_pred_uncond = self.unet( sample=np.expand_dims(latent_model_input[0], axis=0), timestep=timestep, encoder_hidden_states=np.expand_dims(text_embeddings[0], axis=0), **unet_additional_kwargs, )["noise_pred"] noise_pred_text = self.unet( sample=np.expand_dims(latent_model_input[1], axis=0), timestep=timestep, encoder_hidden_states=np.expand_dims(text_embeddings[1], axis=0), **unet_additional_kwargs, )["noise_pred"] # perform guidance if do_classifier_free_guidance: noise_pred = noise_pred_uncond + guidance_scale * ( noise_pred_text - noise_pred_uncond) # compute the previous 
noisy sample x_t -> x_t-1 latents = self.scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs, ).prev_sample.numpy() # call the callback, if provided if callback is not None and i % callback_steps == 0: callback(i, t, latents) # 8. Post-processing image = self.decode_latents(latents) # 9. Run safety checker image, has_nsfw_concept = self.run_safety_checker(image) # 10. Convert to PIL if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: return (image, has_nsfw_concept) return StableDiffusionPipelineOutput( images=image, nsfw_content_detected=has_nsfw_concept) def get_available_schedulers(): schedulers = {} for scheduler in [DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler]: schedulers[scheduler().__class__.__name__.replace("Scheduler", "")] = scheduler return schedulers SCHEDULER_MAP = get_available_schedulers() def get_coreml_pipe(pytorch_pipe, mlpackages_dir, model_version, compute_unit, delete_original_pipe=True, scheduler_override=None, controlnet_models=None, force_zeros_for_empty_prompt=True, sources=None): """ Initializes and returns a `CoreMLStableDiffusionPipeline` from an original diffusers PyTorch pipeline sources: 'packages' or 'compiled' forces creation of model from specified sources. 
sources must be in mlpackages_dir """ # Ensure `scheduler_override` object is of correct type if specified if scheduler_override is not None: assert isinstance(scheduler_override, SchedulerMixin) logger.warning( "Overriding scheduler in pipeline: " f"Default={pytorch_pipe.scheduler}, Override={scheduler_override}") # Gather configured tokenizer and scheduler attributes from the original pipe if 'xl' in model_version: coreml_pipe_kwargs = { "tokenizer": pytorch_pipe.tokenizer, 'tokenizer_2': pytorch_pipe.tokenizer_2, "scheduler": pytorch_pipe.scheduler if scheduler_override is None else scheduler_override, 'xl': True, } model_packages_to_load = ["text_encoder", "text_encoder_2", "unet", "vae_decoder"] else: coreml_pipe_kwargs = { "tokenizer": pytorch_pipe.tokenizer, "scheduler": pytorch_pipe.scheduler if scheduler_override is None else scheduler_override, "feature_extractor": pytorch_pipe.feature_extractor, } model_packages_to_load = ["text_encoder", "unet", "vae_decoder"] coreml_pipe_kwargs["force_zeros_for_empty_prompt"] = force_zeros_for_empty_prompt if getattr(pytorch_pipe, "safety_checker", None) is not None: model_packages_to_load.append("safety_checker") else: logger.warning( f"Original diffusers pipeline for {model_version} does not have a safety_checker, " "Core ML pipeline will mirror this behavior.") coreml_pipe_kwargs["safety_checker"] = None if delete_original_pipe: del pytorch_pipe gc.collect() logger.info("Removed PyTorch pipe to reduce peak memory consumption") if controlnet_models: model_packages_to_load.remove("unet") coreml_pipe_kwargs["unet"] = _load_mlpackage( submodule_name="control-unet", mlpackages_dir=mlpackages_dir, model_version=model_version, compute_unit=compute_unit, ) coreml_pipe_kwargs["controlnet"] = [_load_mlpackage_controlnet( mlpackages_dir, model_version, compute_unit, ) for model_version in controlnet_models] else: coreml_pipe_kwargs["controlnet"] = None # Load Core ML models logger.info(f"Loading Core ML models in memory from 
{mlpackages_dir}") coreml_pipe_kwargs.update({ model_name: _load_mlpackage( submodule_name=model_name, mlpackages_dir=mlpackages_dir, model_version=model_version, compute_unit=compute_unit, sources=sources, ) for model_name in model_packages_to_load }) logger.info("Done.") logger.info("Initializing Core ML pipe for image generation") coreml_pipe = CoreMLStableDiffusionPipeline(**coreml_pipe_kwargs) logger.info("Done.") return coreml_pipe def get_image_path(args, **override_kwargs): """ mkdir output folder and encode metadata in the filename """ out_folder = os.path.join(args.o, "_".join(args.prompt.replace("/", "_").rsplit(" "))) os.makedirs(out_folder, exist_ok=True) out_fname = f"randomSeed_{override_kwargs.get('seed', None) or args.seed}" out_fname += f"_computeUnit_{override_kwargs.get('compute_unit', None) or args.compute_unit}" out_fname += f"_modelVersion_{override_kwargs.get('model_version', None) or args.model_version.replace('/', '_')}" if args.scheduler is not None: out_fname += f"_customScheduler_{override_kwargs.get('scheduler', None) or args.scheduler}" out_fname += f"_numInferenceSteps{override_kwargs.get('num_inference_steps', None) or args.num_inference_steps}" return os.path.join(out_folder, out_fname + ".png") def prepare_controlnet_cond(image_path, height, width): image = Image.open(image_path).convert("RGB") image = image.resize((height, width), resample=Image.LANCZOS) image = np.array(image).transpose(2, 0, 1) / 255.0 return image def main(args): logger.info(f"Setting random seed to {args.seed}") np.random.seed(args.seed) logger.info("Initializing PyTorch pipe for reference configuration") SDP = StableDiffusionXLPipeline if 'xl' in args.model_version else StableDiffusionPipeline pytorch_pipe = SDP.from_pretrained( args.model_version, use_auth_token=True, ) # Get Scheduler user_specified_scheduler = None if args.scheduler is not None: user_specified_scheduler = SCHEDULER_MAP[ args.scheduler].from_config(pytorch_pipe.scheduler.config) # Get 
Force Zeros Config if it exists force_zeros_for_empty_prompt: bool = False if 'xl' in args.model_version and 'force_zeros_for_empty_prompt' in pytorch_pipe.config: force_zeros_for_empty_prompt = pytorch_pipe.config['force_zeros_for_empty_prompt'] coreml_pipe = get_coreml_pipe( pytorch_pipe=pytorch_pipe, mlpackages_dir=args.i, model_version=args.model_version, compute_unit=args.compute_unit, scheduler_override=user_specified_scheduler, controlnet_models=args.controlnet, force_zeros_for_empty_prompt=force_zeros_for_empty_prompt, sources=args.model_sources, ) if args.controlnet: controlnet_cond = [] for i, _ in enumerate(args.controlnet): image_path = args.controlnet_inputs[i] image = prepare_controlnet_cond(image_path, coreml_pipe.height, coreml_pipe.width) controlnet_cond.append(image) else: controlnet_cond = None logger.info("Beginning image generation.") image = coreml_pipe( prompt=args.prompt, height=coreml_pipe.height, width=coreml_pipe.width, num_inference_steps=args.num_inference_steps, guidance_scale=args.guidance_scale, controlnet_cond=controlnet_cond, negative_prompt=args.negative_prompt, unet_batch_one=args.unet_batch_one, ) out_path = get_image_path(args) logger.info(f"Saving generated image to {out_path}") image["images"][0].save(out_path) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--prompt", required=True, help="The text prompt to be used for text-to-image generation.") parser.add_argument( "-i", required=True, help=("Path to input directory with the .mlpackage files generated by " "python_coreml_stable_diffusion.torch2coreml")) parser.add_argument("-o", required=True) parser.add_argument("--seed", "-s", default=93, type=int, help="Random seed to be able to reproduce results") parser.add_argument( "--model-version", default="CompVis/stable-diffusion-v1-4", help= ("The pre-trained model checkpoint and configuration to restore. 
" "For available versions: https://huggingface.co/models?search=stable-diffusion" )) parser.add_argument( "--compute-unit", choices=get_available_compute_units(), default="ALL", help=("The compute units to be used when executing Core ML models. " f"Options: {get_available_compute_units()}")) parser.add_argument( "--scheduler", choices=tuple(SCHEDULER_MAP.keys()), default=None, help=("The scheduler to use for running the reverse diffusion process. " "If not specified, the default scheduler from the diffusers pipeline is utilized")) parser.add_argument( "--num-inference-steps", default=50, type=int, help="The number of iterations the unet model will be executed throughout the reverse diffusion process") parser.add_argument( "--guidance-scale", default=7.5, type=float, help="Controls the influence of the text prompt on sampling process (0=random images)") parser.add_argument( "--controlnet", nargs="*", type=str, help=("Enables ControlNet and use control-unet instead of unet for additional inputs. " "For Multi-Controlnet, provide the model names separated by spaces.")) parser.add_argument( "--controlnet-inputs", nargs="*", type=str, help=("Image paths for ControlNet inputs. " "Please enter images corresponding to each controlnet provided at --controlnet option in same order.")) parser.add_argument( "--negative-prompt", default=None, help="The negative text prompt to be used for text-to-image generation.") parser.add_argument( "--unet-batch-one", action="store_true", help="Do not batch unet predictions for the prompt and negative prompt.") parser.add_argument('--model-sources', default=None, choices=['packages', 'compiled'], help='Force build from `packages` or `compiled`') args = parser.parse_args() main(args) ================================================ FILE: python_coreml_stable_diffusion/torch2coreml.py ================================================ # # For licensing see accompanying LICENSE.md file. # Copyright (C) 2022 Apple Inc. All Rights Reserved. 
#

from python_coreml_stable_diffusion import (
    unet, controlnet, chunk_mlprogram
)

import argparse
from collections import OrderedDict, defaultdict
from copy import deepcopy
import coremltools as ct
from diffusers import (
    StableDiffusionPipeline,
    DiffusionPipeline,
    ControlNetModel
)
from diffusionkit.tests.torch2coreml import (
    convert_mmdit_to_mlpackage,
    convert_vae_to_mlpackage
)
import gc
from huggingface_hub import snapshot_download

import logging

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

import numpy as np
import os
import requests
import shutil
import subprocess
import time
import re
import pathlib

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.set_grad_enabled(False)

from types import MethodType


def _get_coreml_inputs(sample_inputs, args):
    """ Build one `ct.TensorType` per sample input.

    Args:
        sample_inputs: mapping of input name to a `torch.Tensor` or an object
            exposing `.shape` and `.dtype` (e.g. a numpy array)
        args: unused here; kept so the signature stays stable for callers

    Returns:
        list of `ct.TensorType`, in the iteration order of `sample_inputs`
    """
    return [
        ct.TensorType(
            name=k,
            shape=v.shape,
            # torch dtypes are translated to numpy dtypes via a numpy view
            dtype=v.numpy().dtype if isinstance(v, torch.Tensor) else v.dtype,
        ) for k, v in sample_inputs.items()
    ]


def compute_psnr(a, b):
    """ Compute Peak-Signal-to-Noise-Ratio across two numpy.ndarray objects

    `b` is treated as the reference: its absolute maximum is the peak value.
    Small epsilons keep the ratio finite when the arrays are identical.
    """
    max_b = np.abs(b).max()
    rmse = np.sqrt(((a - b) * (a - b)).sum() / b.size)

    eps = 1e-5
    eps2 = 1e-10
    psnr = 20 * np.log10((max_b + eps) / (rmse + eps2))

    return psnr


ABSOLUTE_MIN_PSNR = 35


def report_correctness(original_outputs, final_outputs, log_prefix):
    """ Report PSNR values across two compatible tensors

    Logs the PSNR of `final_outputs` against `original_outputs` and returns it.

    Raises:
        ValueError: if the PSNR is below `ABSOLUTE_MIN_PSNR`
    """
    # PSNR of the reference against itself is the ceiling for this comparison
    original_psnr = compute_psnr(original_outputs, original_outputs)
    final_psnr = compute_psnr(original_outputs, final_outputs)

    dB_change = final_psnr - original_psnr
    logger.info(
        f"{log_prefix}: PSNR changed by {dB_change:.1f} dB ({original_psnr:.1f} -> {final_psnr:.1f})"
    )

    if final_psnr < ABSOLUTE_MIN_PSNR:
        raise ValueError(f"{final_psnr:.1f} dB is too low!")
    else:
        logger.info(
            f"{final_psnr:.1f} dB > {ABSOLUTE_MIN_PSNR} dB (minimum allowed) parity check passed"
        )
    return final_psnr


def _get_out_path(args, submodule_name):
    """ Return the `.mlpackage` path for `submodule_name` inside `args.o`.

    Slashes in the file name (from `args.model_version`) become underscores.
    """
    fname = f"Stable_Diffusion_version_{args.model_version}_{submodule_name}.mlpackage"
    fname = fname.replace("/", "_")
    return os.path.join(args.o, fname)


def _convert_to_coreml(submodule_name, torchscript_module, sample_inputs,
                       output_names, args, out_path=None, precision=None, compute_unit=None):
    """ Convert a traced module to a Core ML mlprogram, or load an existing one.

    If `out_path` already exists the model is loaded from disk instead of
    being converted again. The model is NOT saved here; callers do that.

    Returns:
        (coreml_model, out_path)
    """
    if out_path is None:
        out_path = _get_out_path(args, submodule_name)

    compute_unit = compute_unit or ct.ComputeUnit[args.compute_unit]

    if os.path.exists(out_path):
        logger.info(f"Skipping export because {out_path} already exists")
        logger.info(f"Loading model from {out_path}")

        start = time.time()
        # Note: each model load will trigger a model compilation which takes up to a few minutes.
        # The Swift CLI we provide uses precompiled Core ML models (.mlmodelc) which incurs compilation only
        # upon first load and mitigates the load time in subsequent runs.
        coreml_model = ct.models.MLModel(
            out_path, compute_units=compute_unit)
        logger.info(
            f"Loading {out_path} took {time.time() - start:.1f} seconds")

        coreml_model.compute_unit = compute_unit
    else:
        logger.info(f"Converting {submodule_name} to CoreML..")
        deployment_target = _get_deployment_target(args.min_deployment_target)
        coreml_model = ct.convert(
            torchscript_module,
            convert_to="mlprogram",
            minimum_deployment_target=deployment_target,
            inputs=_get_coreml_inputs(sample_inputs, args),
            outputs=[ct.TensorType(name=name, dtype=np.float32) for name in output_names],
            compute_units=compute_unit,
            compute_precision=precision,
            # The model only needs to be loaded when a parity check will run predictions
            skip_model_load=not args.check_output_correctness,
        )

    # Drops only this function's reference; the caller may still hold one
    del torchscript_module
    gc.collect()

    return coreml_model, out_path


def _get_deployment_target(target_string):
    """
    Convert deployment target string to coremltools target object.

    Args:
        target_string (str): Target deployment string (e.g., "macOS13", "iOS18")

    Returns:
        coremltools target object. Unknown strings fall back to macOS13;
        "macOS15"/"iOS18" fall back to macOS14 when the installed coremltools
        does not define them.
    """
    target_map = {
        "macOS13": ct.target.macOS13,
        "macOS14": ct.target.macOS14,
        "iOS16": ct.target.iOS16,
        "iOS17": ct.target.iOS17,
    }

    # Handle newer targets that might not be available in older coremltools versions
    if target_string in ("macOS15", "iOS18"):
        newer_target = getattr(ct.target, target_string, None)
        if newer_target is not None:
            return newer_target
        logger.warning(f"Deployment target {target_string} not available in this coremltools version. "
                       f"Using macOS14 as fallback.")
        return ct.target.macOS14

    if target_string in target_map:
        return target_map[target_string]

    logger.warning(f"Unknown deployment target {target_string}. Using macOS13 as fallback.")
    return ct.target.macOS13


def quantize_weights(args):
    """ Quantize weights to args.quantize_nbits using a palette (look-up table)

    Covers the text encoders, the UNet variants and any ControlNet models
    listed in `args.convert_controlnet`; models not found on disk are skipped.
    """
    for model_name in ["text_encoder", "text_encoder_2", "unet", "refiner", "control-unet"]:
        logger.info(f"Quantizing {model_name} to {args.quantize_nbits}-bit precision")
        out_path = _get_out_path(args, model_name)
        _quantize_weights(
            out_path,
            model_name,
            args.quantize_nbits
        )

    if args.convert_controlnet:
        for controlnet_model_version in args.convert_controlnet:
            controlnet_model_name = controlnet_model_version.replace("/", "_")
            logger.info(f"Quantizing {controlnet_model_name} to {args.quantize_nbits}-bit precision")
            fname = f"ControlNet_{controlnet_model_name}.mlpackage"
            out_path = os.path.join(args.o, fname)
            _quantize_weights(
                out_path,
                controlnet_model_name,
                args.quantize_nbits
            )


def _quantize_weights(out_path, model_name, nbits):
    """ Palettize the model at `out_path` in place (k-means, `nbits` bits).

    Logs and returns without error when `out_path` does not exist.
    """
    if os.path.exists(out_path):
        logger.info(f"Quantizing {model_name}")
        mlmodel = ct.models.MLModel(out_path,
                                    compute_units=ct.ComputeUnit.CPU_ONLY)

        op_config = ct.optimize.coreml.OpPalettizerConfig(
            mode="kmeans",
            nbits=nbits,
        )

        config = ct.optimize.coreml.OptimizationConfig(
            global_config=op_config,
            op_type_configs={
                "gather": None  # avoid quantizing the embedding table
            }
        )

        # Overwrites the original package with the palettized model
        ct.optimize.coreml.palettize_weights(mlmodel, config=config).save(out_path)
        logger.info("Done")
    else:
        logger.info(
            f"Skipped quantizing {model_name} (Not found at {out_path})")


def _compile_coreml_model(source_model_path, output_dir, final_name):
    """ Compiles Core ML models using the coremlcompiler utility from Xcode toolchain

    Returns:
        path of `<output_dir>/<final_name>.mlmodelc` (existing one is reused)

    Raises:
        subprocess.CalledProcessError: if `xcrun coremlcompiler` exits non-zero
    """
    target_path = os.path.join(output_dir, f"{final_name}.mlmodelc")
    if os.path.exists(target_path):
        logger.warning(
            f"Found existing compiled model at {target_path}! Skipping..")
        return target_path

    logger.info(f"Compiling {source_model_path}")
    source_model_name = os.path.basename(
        os.path.splitext(source_model_path)[0])

    # The destination may be a sub-directory (e.g. "controlnet") that does not exist yet
    os.makedirs(output_dir, exist_ok=True)

    # Argument list (no shell) so paths with spaces or metacharacters are safe;
    # check=True surfaces compiler failures here instead of at shutil.move below.
    subprocess.run(
        ["xcrun", "coremlcompiler", "compile", source_model_path, output_dir],
        check=True,
    )
    compiled_output = os.path.join(output_dir,
                                   f"{source_model_name}.mlmodelc")
    shutil.move(compiled_output, target_path)

    return target_path


def _download_t5_model(args, t5_save_path):
    """ Download the pre-converted T5 encoder and copy it to `t5_save_path`.

    `args.text_encoder_t5_url` must look like
    `https://huggingface.co/<repo_id>/resolve/main/<subpath>`.

    Raises:
        ValueError: if the URL does not match that pattern
    """
    t5_url = args.text_encoder_t5_url
    match = re.match(r'https://huggingface.co/(.+)/resolve/main/(.+)', t5_url)
    if not match:
        raise ValueError(f"Invalid Hugging Face URL: {t5_url}")
    repo_id, model_subpath = match.groups()

    download_path = snapshot_download(
        repo_id=repo_id,
        revision="main",
        allow_patterns=[f"{model_subpath}/*"]
    )
    logger.info(f"Downloaded T5 model to {download_path}")

    # Move the downloaded model to the top level of the Resources directory
    logger.info(f"Copying T5 model from {download_path} to {t5_save_path}")
    cache_path = os.path.join(download_path, model_subpath)
    shutil.copytree(cache_path, t5_save_path)


def _fetch_url_to_file(url, destination_path):
    """ Download `url` and write the response body to `destination_path`.

    Raises:
        requests.HTTPError: on a non-success HTTP status, so that an error
            page is never written out as if it were the requested resource
    """
    response = requests.get(url)
    response.raise_for_status()
    with open(destination_path, "wb") as f:
        f.write(response.content)


def bundle_resources_for_swift_cli(args):
    """
    - Compiles Core ML models from mlpackage into mlmodelc format
    - Download tokenizer resources for the text encoder

    Returns:
        path of the `Resources` directory created under `args.o`
    """
    resources_dir = os.path.join(args.o, "Resources")
    if not os.path.exists(resources_dir):
        os.makedirs(resources_dir, exist_ok=True)
        logger.info(f"Created {resources_dir} for Swift CLI assets")

    # Compile model using coremlcompiler (Significantly reduces the load time for unet)
    for source_name, target_name in [("text_encoder", "TextEncoder"),
                                     ("text_encoder_2", "TextEncoder2"),
                                     ("vae_decoder", "VAEDecoder"),
                                     ("vae_encoder", "VAEEncoder"),
                                     ("unet", "Unet"),
                                     ("unet_chunk1", "UnetChunk1"),
                                     ("unet_chunk2", "UnetChunk2"),
                                     ("refiner", "UnetRefiner"),
                                     ("refiner_chunk1", "UnetRefinerChunk1"),
                                     ("refiner_chunk2", "UnetRefinerChunk2"),
                                     ("mmdit", "MultiModalDiffusionTransformer"),
                                     ("control-unet", "ControlledUnet"),
                                     ("control-unet_chunk1", "ControlledUnetChunk1"),
                                     ("control-unet_chunk2", "ControlledUnetChunk2"),
                                     ("safety_checker", "SafetyChecker")]:
        source_path = _get_out_path(args, source_name)
        if os.path.exists(source_path):
            target_path = _compile_coreml_model(source_path, resources_dir,
                                                target_name)
            logger.info(f"Compiled {source_path} to {target_path}")
        else:
            logger.warning(
                f"{source_path} not found, skipping compilation to {target_name}.mlmodelc"
            )

    if args.convert_controlnet:
        for controlnet_model_version in args.convert_controlnet:
            controlnet_model_name = controlnet_model_version.replace("/", "_")
            fname = f"ControlNet_{controlnet_model_name}.mlpackage"
            source_path = os.path.join(args.o, fname)
            controlnet_dir = os.path.join(resources_dir, "controlnet")
            # e.g. "lllyasviel_sd-controlnet-canny" -> "LllyasvielSdControlnetCanny"
            target_name = "".join([word.title() for word in re.split('_|-', controlnet_model_name)])

            if os.path.exists(source_path):
                target_path = _compile_coreml_model(source_path, controlnet_dir,
                                                    target_name)
                logger.info(f"Compiled {source_path} to {target_path}")
            else:
                logger.warning(
                    f"{source_path} not found, skipping compilation to {target_name}.mlmodelc"
                )

    # Fetch and save vocabulary JSON file for text tokenizer
    logger.info("Downloading and saving tokenizer vocab.json")
    _fetch_url_to_file(args.text_encoder_vocabulary_url,
                       os.path.join(resources_dir, "vocab.json"))
    logger.info("Done")

    # Fetch and save merged pairs text file for text tokenizer
    logger.info("Downloading and saving tokenizer merges.txt")
    _fetch_url_to_file(args.text_encoder_merges_url,
                       os.path.join(resources_dir, "merges.txt"))
    logger.info("Done")

    # Fetch and save pre-converted T5 text encoder model
    t5_model_name = "TextEncoderT5.mlmodelc"
    t5_save_path = os.path.join(resources_dir, t5_model_name)
    if args.include_t5:
        if not os.path.exists(t5_save_path):
            logger.info("Downloading pre-converted T5 encoder model TextEncoderT5.mlmodelc")
            _download_t5_model(args, t5_save_path)
            logger.info("Done")
        else:
            logger.info(f"Skipping T5 download as {t5_save_path} already exists")

        # Fetch and save T5 text tokenizer JSON files
        logger.info("Downloading and saving T5 tokenizer files tokenizer_config.json and tokenizer.json")
        _fetch_url_to_file(args.text_encoder_t5_config_url,
                           os.path.join(resources_dir, "tokenizer_config.json"))
        _fetch_url_to_file(args.text_encoder_t5_data_url,
                           os.path.join(resources_dir, "tokenizer.json"))
        logger.info("Done")

    return resources_dir


from transformers.models.clip import modeling_clip

# Copied from https://github.com/huggingface/transformers/blob/v4.30.0/src/transformers/models/clip/modeling_clip.py#L677C1-L692C1
# Starting from transformers >= 4.35.0, the _make_causal_mask function is replaced by _create_4d_causal_attention_mask in modeling_clip.
# For backward compatibility with versions < 4.35.0, both functions are patched here.
def patched_make_causal_mask(input_ids_shape, dtype, device, past_key_values_length: int = 0): """ Patch to replace torch.finfo(dtype).min with -1e4 """ bsz, tgt_len = input_ids_shape mask = torch.full((tgt_len, tgt_len), torch.tensor(-1e4, device=device), device=device) mask_cond = torch.arange(mask.size(-1), device=device) mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) mask = mask.to(dtype) if past_key_values_length > 0: mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) modeling_clip._make_causal_mask = patched_make_causal_mask # For transformers >= 4.30.0 and transformers < 4.35.0 modeling_clip._create_4d_causal_attention_mask = patched_make_causal_mask # For transformers >= 4.35.0 def convert_text_encoder(text_encoder, tokenizer, submodule_name, args): """ Converts the text encoder component of Stable Diffusion """ text_encoder = text_encoder.to(dtype=torch.float32) out_path = _get_out_path(args, submodule_name) if os.path.exists(out_path): logger.info( f"`{submodule_name}` already exists at {out_path}, skipping conversion." 
) return # Create sample inputs for tracing, conversion and correctness verification text_encoder_sequence_length = tokenizer.model_max_length sample_text_encoder_inputs = { "input_ids": torch.randint( text_encoder.config.vocab_size, (1, text_encoder_sequence_length), # https://github.com/apple/coremltools/issues/1423 dtype=torch.float32, ) } sample_text_encoder_inputs_spec = { k: (v.shape, v.dtype) for k, v in sample_text_encoder_inputs.items() } logger.info(f"Sample inputs spec: {sample_text_encoder_inputs_spec}") class TextEncoder(nn.Module): def __init__(self, with_hidden_states_for_layer=None): super().__init__() self.text_encoder = text_encoder self.with_hidden_states_for_layer = with_hidden_states_for_layer def forward(self, input_ids): if self.with_hidden_states_for_layer is not None: output = self.text_encoder(input_ids, output_hidden_states=True) hidden_embeds = output.hidden_states[self.with_hidden_states_for_layer] if "text_embeds" in output: return (hidden_embeds, output.text_embeds) else: return (hidden_embeds, output.pooler_output) else: return self.text_encoder(input_ids, return_dict=False) # SD XL uses the hidden states after the encoder layers from both encoders, # and the pooled `text_embeds` output of the second encoder. 
hidden_layer = -2 if args.xl_version else None reference_text_encoder = TextEncoder(with_hidden_states_for_layer=hidden_layer).eval() logger.info(f"JIT tracing {submodule_name}..") reference_text_encoder = torch.jit.trace( reference_text_encoder, (sample_text_encoder_inputs["input_ids"].to(torch.int32), ), ) logger.info("Done.") if args.xl_version: output_names = ["hidden_embeds", "pooled_outputs"] else: output_names = ["last_hidden_state", "pooled_outputs"] coreml_text_encoder, out_path = _convert_to_coreml( submodule_name, reference_text_encoder, sample_text_encoder_inputs, output_names, args) # Set model metadata coreml_text_encoder.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}" if args.xl_version: coreml_text_encoder.license = "OpenRAIL++-M (https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/LICENSE.md)" else: coreml_text_encoder.license = "OpenRAIL (https://huggingface.co/spaces/CompVis/stable-diffusion-license)" coreml_text_encoder.version = args.model_version coreml_text_encoder.short_description = \ "Stable Diffusion generates images conditioned on text and/or other images as input through the diffusion process. " \ "Please refer to https://arxiv.org/abs/2112.10752 for details." 
# Set the input descriptions coreml_text_encoder.input_description[ "input_ids"] = "The token ids that represent the input text" # Set the output descriptions if args.xl_version: coreml_text_encoder.output_description[ "hidden_embeds"] = "Hidden states after the encoder layers" else: coreml_text_encoder.output_description[ "last_hidden_state"] = "The token embeddings as encoded by the Transformer model" coreml_text_encoder.output_description[ "pooled_outputs"] = "The version of the `last_hidden_state` output after pooling" coreml_text_encoder.save(out_path) logger.info(f"Saved text_encoder into {out_path}") # Parity check PyTorch vs CoreML if args.check_output_correctness: baseline_out = text_encoder( sample_text_encoder_inputs["input_ids"].to(torch.int32), output_hidden_states=args.xl_version, return_dict=True, ) if args.xl_version: # TODO: maybe check pooled_outputs too baseline_out = baseline_out.hidden_states[hidden_layer].numpy() else: baseline_out = baseline_out.last_hidden_state.numpy() coreml_out = coreml_text_encoder.predict( {k: v.numpy() for k, v in sample_text_encoder_inputs.items()} ) coreml_out = coreml_out["hidden_embeds" if args.xl_version else "last_hidden_state"] report_correctness( baseline_out, coreml_out, "text_encoder baseline PyTorch to reference CoreML") del reference_text_encoder, coreml_text_encoder gc.collect() def modify_coremltools_torch_frontend_badbmm(): """ Modifies coremltools torch frontend for baddbmm to be robust to the `beta` argument being of non-float dtype: e.g. 
https://github.com/huggingface/diffusers/blob/v0.8.1/src/diffusers/models/attention.py#L315 """ from coremltools.converters.mil import register_torch_op from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.frontend.torch.ops import _get_inputs from coremltools.converters.mil.frontend.torch.torch_op_registry import _TORCH_OPS_REGISTRY if "baddbmm" in _TORCH_OPS_REGISTRY: del _TORCH_OPS_REGISTRY["baddbmm"] @register_torch_op def baddbmm(context, node): """ baddbmm(Tensor input, Tensor batch1, Tensor batch2, Scalar beta=1, Scalar alpha=1) output = beta * input + alpha * batch1 * batch2 Notice that batch1 and batch2 must be 3-D tensors each containing the same number of matrices. If batch1 is a (b×n×m) tensor, batch2 is a (b×m×p) tensor, then input must be broadcastable with a (b×n×p) tensor and out will be a (b×n×p) tensor. """ assert len(node.outputs) == 1 inputs = _get_inputs(context, node, expected=5) bias, batch1, batch2, beta, alpha = inputs if beta.val != 1.0: # Apply scaling factor beta to the bias. if beta.val.dtype == np.int32: beta = mb.cast(x=beta, dtype="fp32") logger.warning( f"Casted the `beta`(value={beta.val}) argument of `baddbmm` op " "from int32 to float32 dtype for conversion!") bias = mb.mul(x=beta, y=bias, name=bias.name + "_scaled") context.add(bias) if alpha.val != 1.0: # Apply scaling factor alpha to the input. batch1 = mb.mul(x=alpha, y=batch1, name=batch1.name + "_scaled") context.add(batch1) bmm_node = mb.matmul(x=batch1, y=batch2, name=node.name + "_bmm") context.add(bmm_node) baddbmm_node = mb.add(x=bias, y=bmm_node, name=node.name) context.add(baddbmm_node) def convert_vae_decoder(pipe, args): """ Converts the VAE Decoder component of Stable Diffusion """ out_path = _get_out_path(args, "vae_decoder") if os.path.exists(out_path): logger.info( f"`vae_decoder` already exists at {out_path}, skipping conversion." 
) return if not hasattr(pipe, "unet"): raise RuntimeError( "convert_unet() deletes pipe.unet to save RAM. " "Please use convert_vae_decoder() before convert_unet()") z_shape = ( 1, # B pipe.vae.config.latent_channels, # C args.latent_h or pipe.unet.config.sample_size, # H args.latent_w or pipe.unet.config.sample_size, # W ) if args.custom_vae_version is None and args.xl_version: inputs_dtype = torch.float32 compute_precision = ct.precision.FLOAT32 # FIXME: Hardcoding to CPU_AND_GPU since ANE doesn't support FLOAT32 compute_unit = ct.ComputeUnit.CPU_AND_GPU else: inputs_dtype = torch.float16 compute_precision = None compute_unit = None sample_vae_decoder_inputs = { "z": torch.rand(*z_shape, dtype=inputs_dtype) } class VAEDecoder(nn.Module): """ Wrapper nn.Module wrapper for pipe.decode() method """ def __init__(self): super().__init__() self.post_quant_conv = pipe.vae.post_quant_conv.to(dtype=torch.float32) self.decoder = pipe.vae.decoder.to(dtype=torch.float32) def forward(self, z): return self.decoder(self.post_quant_conv(z)) baseline_decoder = VAEDecoder().eval() # No optimization needed for the VAE Decoder as it is a pure ConvNet traced_vae_decoder = torch.jit.trace( baseline_decoder, (sample_vae_decoder_inputs["z"].to(torch.float32), )) modify_coremltools_torch_frontend_badbmm() coreml_vae_decoder, out_path = _convert_to_coreml( "vae_decoder", traced_vae_decoder, sample_vae_decoder_inputs, ["image"], args, precision=compute_precision, compute_unit=compute_unit) # Set model metadata coreml_vae_decoder.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}" if args.xl_version: coreml_vae_decoder.license = "OpenRAIL++-M (https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/LICENSE.md)" else: coreml_vae_decoder.license = "OpenRAIL (https://huggingface.co/spaces/CompVis/stable-diffusion-license)" coreml_vae_decoder.version = args.model_version coreml_vae_decoder.short_description = \ "Stable Diffusion 
generates images conditioned on text and/or other images as input through the diffusion process. " \ "Please refer to https://arxiv.org/abs/2112.10752 for details." # Set the input descriptions coreml_vae_decoder.input_description["z"] = \ "The denoised latent embeddings from the unet model after the last step of reverse diffusion" # Set the output descriptions coreml_vae_decoder.output_description[ "image"] = "Generated image normalized to range [-1, 1]" coreml_vae_decoder.save(out_path) logger.info(f"Saved vae_decoder into {out_path}") # Parity check PyTorch vs CoreML if args.check_output_correctness: baseline_out = baseline_decoder( z=sample_vae_decoder_inputs["z"].to(torch.float32)).numpy() coreml_out = list( coreml_vae_decoder.predict( {k: v.numpy() for k, v in sample_vae_decoder_inputs.items()}).values())[0] report_correctness(baseline_out, coreml_out, "vae_decoder baseline PyTorch to baseline CoreML") del traced_vae_decoder, pipe.vae.decoder, coreml_vae_decoder gc.collect() def convert_vae_decoder_sd3(args): """ Converts the VAE component of Stable Diffusion 3 """ out_path = _get_out_path(args, "vae_decoder") if os.path.exists(out_path): logger.info( f"`vae_decoder` already exists at {out_path}, skipping conversion." 
) return # Convert the VAE Decoder model via DiffusionKit converted_vae_path = convert_vae_to_mlpackage( model_version=args.model_version, latent_h=args.latent_h, latent_w=args.latent_w, output_dir=args.o, ) # Load converted model coreml_vae_decoder = ct.models.MLModel(converted_vae_path) # Set model metadata coreml_vae_decoder.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}" coreml_vae_decoder.license = "Stability AI Community License (https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE.md)" coreml_vae_decoder.version = args.model_version coreml_vae_decodershort_description = \ "Stable Diffusion 3 generates images conditioned on text or other images as input through the diffusion process. " \ "Please refer to https://arxiv.org/pdf/2403.03206 for details." # Set the input descriptions coreml_vae_decoder.input_description["z"] = \ "The denoised latent embeddings from the unet model after the last step of reverse diffusion" # Set the output descriptions coreml_vae_decoder.output_description[ "image"] = "Generated image normalized to range [-1, 1]" # Set package version metadata from python_coreml_stable_diffusion._version import __version__ coreml_vae_decoder.user_defined_metadata["com.github.apple.ml-stable-diffusion.version"] = __version__ from diffusionkit.version import __version__ coreml_vae_decoder.user_defined_metadata["com.github.argmax.diffusionkit.version"] = __version__ # Save the updated model coreml_vae_decoder.save(out_path) logger.info(f"Saved vae_decoder into {out_path}") # Delete the original file if os.path.exists(converted_vae_path): shutil.rmtree(converted_vae_path) del coreml_vae_decoder gc.collect() def convert_vae_encoder(pipe, args): """ Converts the VAE Encoder component of Stable Diffusion """ out_path = _get_out_path(args, "vae_encoder") if os.path.exists(out_path): logger.info( f"`vae_encoder` already exists at {out_path}, skipping conversion." 
) return if not hasattr(pipe, "unet"): raise RuntimeError( "convert_unet() deletes pipe.unet to save RAM. " "Please use convert_vae_encoder() before convert_unet()") height = (args.latent_h or pipe.unet.config.sample_size) * 8 width = (args.latent_w or pipe.unet.config.sample_size) * 8 x_shape = ( 1, # B 3, # C (RGB range from -1 to 1) height, # H width, # w ) if args.xl_version: inputs_dtype = torch.float32 compute_precision = ct.precision.FLOAT32 # FIXME: Hardcoding to CPU_AND_GPU since ANE doesn't support FLOAT32 compute_unit = ct.ComputeUnit.CPU_AND_GPU else: inputs_dtype = torch.float16 compute_precision = None compute_unit = None sample_vae_encoder_inputs = { "x": torch.rand(*x_shape, dtype=inputs_dtype) } class VAEEncoder(nn.Module): """ Wrapper nn.Module wrapper for pipe.encode() method """ def __init__(self): super().__init__() self.quant_conv = pipe.vae.quant_conv.to(dtype=torch.float32) self.encoder = pipe.vae.encoder.to(dtype=torch.float32) def forward(self, x): return self.quant_conv(self.encoder(x)) baseline_encoder = VAEEncoder().eval() # No optimization needed for the VAE Encoder as it is a pure ConvNet traced_vae_encoder = torch.jit.trace( baseline_encoder, (sample_vae_encoder_inputs["x"].to(torch.float32), )) modify_coremltools_torch_frontend_badbmm() coreml_vae_encoder, out_path = _convert_to_coreml( "vae_encoder", traced_vae_encoder, sample_vae_encoder_inputs, ["latent"], args, precision=compute_precision, compute_unit=compute_unit) # Set model metadata coreml_vae_encoder.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}" if args.xl_version: coreml_vae_encoder.license = "OpenRAIL++-M (https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/LICENSE.md)" else: coreml_vae_encoder.license = "OpenRAIL (https://huggingface.co/spaces/CompVis/stable-diffusion-license)" coreml_vae_encoder.version = args.model_version coreml_vae_encoder.short_description = \ "Stable Diffusion generates images 
conditioned on text and/or other images as input through the diffusion process. " \ "Please refer to https://arxiv.org/abs/2112.10752 for details." # Set the input descriptions coreml_vae_encoder.input_description["x"] = \ "The input image to base the initial latents on normalized to range [-1, 1]" # Set the output descriptions coreml_vae_encoder.output_description["latent"] = "The latent embeddings from the unet model from the input image." coreml_vae_encoder.save(out_path) logger.info(f"Saved vae_encoder into {out_path}") # Parity check PyTorch vs CoreML if args.check_output_correctness: baseline_out = baseline_encoder( x=sample_vae_encoder_inputs["x"].to(torch.float32)).numpy() coreml_out = list( coreml_vae_encoder.predict( {k: v.numpy() for k, v in sample_vae_encoder_inputs.items()}).values())[0] report_correctness(baseline_out, coreml_out, "vae_encoder baseline PyTorch to baseline CoreML") del traced_vae_encoder, pipe.vae.encoder, coreml_vae_encoder gc.collect() def convert_unet(pipe, args, model_name=None): """ Converts the UNet component of Stable Diffusion """ if args.unet_support_controlnet: unet_name = "control-unet" else: unet_name = model_name or "unet" out_path = _get_out_path(args, unet_name) # Check if Unet was previously exported and then chunked unet_chunks_exist = all( os.path.exists( out_path.replace(".mlpackage", f"_chunk{idx+1}.mlpackage")) for idx in range(2)) if args.chunk_unet and unet_chunks_exist: logger.info("`unet` chunks already exist, skipping conversion.") del pipe.unet gc.collect() return # If original Unet does not exist, export it from PyTorch+diffusers elif not os.path.exists(out_path): # Prepare sample input shapes and values batch_size = 2 # for classifier-free guidance if args.unet_batch_one: batch_size = 1 # for not using classifier-free guidance sample_shape = ( batch_size, # B pipe.unet.config.in_channels, # C args.latent_h or pipe.unet.config.sample_size, # H args.latent_w or pipe.unet.config.sample_size, # W ) if not 
hasattr(pipe, "text_encoder"): raise RuntimeError( "convert_text_encoder() deletes pipe.text_encoder to save RAM. " "Please use convert_unet() before convert_text_encoder()") if hasattr(pipe, "text_encoder") and pipe.text_encoder is not None: text_token_sequence_length = pipe.text_encoder.config.max_position_embeddings hidden_size = pipe.text_encoder.config.hidden_size, elif hasattr(pipe, "text_encoder_2") and pipe.text_encoder_2 is not None: text_token_sequence_length = pipe.text_encoder_2.config.max_position_embeddings hidden_size = pipe.text_encoder_2.config.hidden_size, encoder_hidden_states_shape = ( batch_size, args.text_encoder_hidden_size or pipe.unet.config.cross_attention_dim or hidden_size, 1, args.text_token_sequence_length or text_token_sequence_length, ) # Create the scheduled timesteps for downstream use DEFAULT_NUM_INFERENCE_STEPS = 50 pipe.scheduler.set_timesteps(DEFAULT_NUM_INFERENCE_STEPS) sample_unet_inputs = OrderedDict([ ("sample", torch.rand(*sample_shape)), ("timestep", torch.tensor([pipe.scheduler.timesteps[0].item()] * (batch_size)).to(torch.float32)), ("encoder_hidden_states", torch.rand(*encoder_hidden_states_shape)) ]) # Prepare inputs baseline_sample_unet_inputs = deepcopy(sample_unet_inputs) baseline_sample_unet_inputs[ "encoder_hidden_states"] = baseline_sample_unet_inputs[ "encoder_hidden_states"].squeeze(2).transpose(1, 2) # Initialize reference unet if args.xl_version: unet_cls = unet.UNet2DConditionModelXL # Sample time_ids height = (args.latent_h or pipe.unet.config.sample_size) * 8 width = (args.latent_w or pipe.unet.config.sample_size) * 8 original_size = (height, width) # output_resolution crops_coords_top_left = (0, 0) # topleft_crop_cond target_size = (height, width) # resolution_cond if hasattr(pipe.config, "requires_aesthetics_score") and pipe.config.requires_aesthetics_score: # Part of SDXL's micro-conditioning as explained in section 2.2 of # 
[https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to # simulate an aesthetic score of the generated image by influencing the positive and negative text conditions. aesthetic_score = 6.0 # default aesthetic_score negative_aesthetic_score = 2.5 # default negative_aesthetic_score add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,)) else: add_time_ids = list(original_size + crops_coords_top_left + target_size) add_neg_time_ids = list(original_size + crops_coords_top_left + target_size) time_ids = [ add_neg_time_ids, add_time_ids ] # Pooled text embedding from text_encoder_2 text_embeds_shape = ( batch_size, pipe.text_encoder_2.config.hidden_size ) additional_xl_inputs = OrderedDict([ ("time_ids", torch.tensor(time_ids).to(torch.float32)), ("text_embeds", torch.rand(*text_embeds_shape)), ]) sample_unet_inputs.update(additional_xl_inputs) baseline_sample_unet_inputs['added_cond_kwargs'] = additional_xl_inputs else: unet_cls = unet.UNet2DConditionModel reference_unet = unet_cls(support_controlnet=args.unet_support_controlnet, **pipe.unet.config).eval() load_state_dict_summary = reference_unet.load_state_dict( pipe.unet.state_dict()) if args.unet_support_controlnet: from .unet import calculate_conv2d_output_shape additional_residuals_shapes = [] # conv_in out_h, out_w = calculate_conv2d_output_shape( (args.latent_h or pipe.unet.config.sample_size), (args.latent_w or pipe.unet.config.sample_size), reference_unet.conv_in, ) additional_residuals_shapes.append( (batch_size, reference_unet.conv_in.out_channels, out_h, out_w)) # down_blocks for down_block in reference_unet.down_blocks: additional_residuals_shapes += [ (batch_size, resnet.out_channels, out_h, out_w) for resnet in down_block.resnets ] if hasattr(down_block, "downsamplers") and down_block.downsamplers is not None: for downsampler in 
down_block.downsamplers: out_h, out_w = calculate_conv2d_output_shape(out_h, out_w, downsampler.conv) additional_residuals_shapes.append( (batch_size, down_block.downsamplers[-1].conv.out_channels, out_h, out_w)) # mid_block additional_residuals_shapes.append( (batch_size, reference_unet.mid_block.resnets[-1].out_channels, out_h, out_w) ) baseline_sample_unet_inputs["down_block_additional_residuals"] = () for i, shape in enumerate(additional_residuals_shapes): sample_residual_input = torch.rand(*shape) sample_unet_inputs[f"additional_residual_{i}"] = sample_residual_input if i == len(additional_residuals_shapes) - 1: baseline_sample_unet_inputs["mid_block_additional_residual"] = sample_residual_input else: baseline_sample_unet_inputs["down_block_additional_residuals"] += (sample_residual_input, ) sample_unet_inputs_spec = { k: (v.shape, v.dtype) for k, v in sample_unet_inputs.items() } logger.info(f"Sample UNet inputs spec: {sample_unet_inputs_spec}") # JIT trace logger.info("JIT tracing..") reference_unet = torch.jit.trace(reference_unet, list(sample_unet_inputs.values())) logger.info("Done.") if args.check_output_correctness: baseline_out = pipe.unet.to(torch.float32)(**baseline_sample_unet_inputs, return_dict=False)[0].numpy() reference_out = reference_unet(*sample_unet_inputs.values())[0].numpy() report_correctness(baseline_out, reference_out, "unet baseline to reference PyTorch") del pipe.unet gc.collect() coreml_sample_unet_inputs = { k: v.numpy().astype(np.float16) for k, v in sample_unet_inputs.items() } coreml_unet, out_path = _convert_to_coreml(unet_name, reference_unet, coreml_sample_unet_inputs, ["noise_pred"], args) del reference_unet gc.collect() # Set model metadata coreml_unet.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}" if args.xl_version: coreml_unet.license = "OpenRAIL++-M (https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/LICENSE.md)" else: coreml_unet.license = "OpenRAIL 
(https://huggingface.co/spaces/CompVis/stable-diffusion-license)" coreml_unet.version = args.model_version if model_name != "refiner" or not hasattr(args, "refiner_version") else args.refiner_version coreml_unet.short_description = \ "Stable Diffusion generates images conditioned on text or other images as input through the diffusion process. " \ "Please refer to https://arxiv.org/abs/2112.10752 for details." # Set the input descriptions coreml_unet.input_description["sample"] = \ "The low resolution latent feature maps being denoised through reverse diffusion" coreml_unet.input_description["timestep"] = \ "A value emitted by the associated scheduler object to condition the model on a given noise schedule" coreml_unet.input_description["encoder_hidden_states"] = \ "Output embeddings from the associated text_encoder model to condition to generated image on text. " \ "A maximum of 77 tokens (~40 words) are allowed. Longer text is truncated. " \ "Shorter text does not reduce computation." if args.xl_version: coreml_unet.input_description["time_ids"] = \ "Additional embeddings that if specified are added to the embeddings that are passed along to the UNet blocks." coreml_unet.input_description["text_embeds"] = \ "Additional embeddings from text_encoder_2 that if specified are added to the embeddings that are passed along to the UNet blocks." # Set the output descriptions coreml_unet.output_description["noise_pred"] = \ "Same shape and dtype as the `sample` input. 
def convert_mmdit(args):
    """ Converts the MMDiT component of Stable Diffusion 3

    The conversion itself is delegated to `convert_mmdit_to_mlpackage`. This
    function then loads the produced package, attaches author/license/version
    metadata and input/output descriptions, saves the result to the path
    returned by `_get_out_path(args, "mmdit")` and removes the intermediate
    package. Nothing is converted if that output path already exists.

    Args:
        args: Parsed command line arguments. This function directly reads
            `model_version`, `latent_h`, `latent_w` and `o` (output directory).
    """
    out_path = _get_out_path(args, "mmdit")
    if os.path.exists(out_path):
        logger.info(
            f"`mmdit` already exists at {out_path}, skipping conversion."
        )
        return

    # Convert the MMDiT model via DiffusionKit
    converted_mmdit_path = convert_mmdit_to_mlpackage(
        model_version=args.model_version,
        latent_h=args.latent_h,
        latent_w=args.latent_w,
        output_dir=args.o,
        # FIXME: Hardcoding to CPU_AND_GPU since ANE doesn't support FLOAT32
        compute_precision=ct.precision.FLOAT32,
        compute_unit=ct.ComputeUnit.CPU_AND_GPU,
    )

    # Load converted model
    coreml_mmdit = ct.models.MLModel(converted_mmdit_path)

    # Set model metadata
    coreml_mmdit.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}"
    coreml_mmdit.license = "Stability AI Community License (https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE.md)"
    coreml_mmdit.version = args.model_version
    coreml_mmdit.short_description = \
        "Stable Diffusion 3 generates images conditioned on text or other images as input through the diffusion process. " \
        "Please refer to https://arxiv.org/pdf/2403.03206 for details."

    # Set the input descriptions
    coreml_mmdit.input_description["latent_image_embeddings"] = \
        "The low resolution latent feature maps being denoised through reverse diffusion"
    coreml_mmdit.input_description["token_level_text_embeddings"] = \
        "Output embeddings from the associated text_encoder model to condition to generated image on text. " \
        "A maximum of 77 tokens (~40 words) are allowed. Longer text is truncated. "
    coreml_mmdit.input_description["pooled_text_embeddings"] = \
        "Additional embeddings that if specified are added to the embeddings that are passed along to the MMDiT model."
    coreml_mmdit.input_description["timestep"] = \
        "A value emitted by the associated scheduler object to condition the model on a given noise schedule"

    # Set the output descriptions
    coreml_mmdit.output_description["denoiser_output"] = \
        "Same shape and dtype as the `latent_image_embeddings` input. " \
        "The predicted noise to facilitate the reverse diffusion (denoising) process"

    # Set package version metadata. The two versions are imported under
    # distinct names so that neither silently shadows the other.
    from python_coreml_stable_diffusion._version import __version__ as package_version
    coreml_mmdit.user_defined_metadata["com.github.apple.ml-stable-diffusion.version"] = package_version
    from diffusionkit.version import __version__ as diffusionkit_version
    coreml_mmdit.user_defined_metadata["com.github.argmax.diffusionkit.version"] = diffusionkit_version

    # Save the updated model
    coreml_mmdit.save(out_path)

    logger.info(f"Saved mmdit into {out_path}")

    # Delete the intermediate package produced by DiffusionKit.
    # NOTE(review): the naming scheme of `convert_mmdit_to_mlpackage` is not
    # visible here; the path comparison guards against deleting the model that
    # was just saved, should both paths ever coincide.
    is_intermediate = os.path.abspath(converted_mmdit_path) != os.path.abspath(out_path)
    if is_intermediate and os.path.exists(converted_mmdit_path):
        shutil.rmtree(converted_mmdit_path)

    del coreml_mmdit
    gc.collect()
safety_checker_input = pipe.feature_extractor( pipe.numpy_to_pil(sample_image), return_tensors="pt", ).pixel_values.to(torch.float32) sample_safety_checker_inputs = OrderedDict([ ("clip_input", safety_checker_input), ("images", torch.from_numpy(sample_image)), ("adjustment", torch.tensor([0]).to(torch.float32)), ]) sample_safety_checker_inputs_spec = { k: (v.shape, v.dtype) for k, v in sample_safety_checker_inputs.items() } logger.info(f"Sample inputs spec: {sample_safety_checker_inputs_spec}") # Patch safety_checker's forward pass to be vectorized and avoid conditional blocks # (similar to pipe.safety_checker.forward_onnx) from diffusers.pipelines.stable_diffusion import safety_checker def forward_coreml(self, clip_input, images, adjustment): """ Forward pass implementation for safety_checker """ def cosine_distance(image_embeds, text_embeds): return F.normalize(image_embeds) @ F.normalize( text_embeds).transpose(0, 1) pooled_output = self.vision_model(clip_input)[1] # pooled_output image_embeds = self.visual_projection(pooled_output) special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) cos_dist = cosine_distance(image_embeds, self.concept_embeds) special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment special_care = special_scores.gt(0).float().sum(dim=1).gt(0).float() special_adjustment = special_care * 0.01 special_adjustment = special_adjustment.unsqueeze(1).expand( -1, cos_dist.shape[1]) concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment has_nsfw_concepts = concept_scores.gt(0).float().sum(dim=1).gt(0)[:, None, None, None] has_nsfw_concepts_inds, _ = torch.broadcast_tensors( has_nsfw_concepts, images) images[has_nsfw_concepts_inds] = 0.0 # black image return images, has_nsfw_concepts.float(), concept_scores baseline_safety_checker = deepcopy(pipe.safety_checker.eval()) setattr(baseline_safety_checker, "forward", MethodType(forward_coreml, baseline_safety_checker)) # In order to parity 
check the actual signal, we need to override the forward pass to return `concept_scores` which is the # output before thresholding # Reference: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py#L100 def forward_extended_return(self, clip_input, images, adjustment): def cosine_distance(image_embeds, text_embeds): normalized_image_embeds = F.normalize(image_embeds) normalized_text_embeds = F.normalize(text_embeds) return torch.mm(normalized_image_embeds, normalized_text_embeds.t()) pooled_output = self.vision_model(clip_input)[1] # pooled_output image_embeds = self.visual_projection(pooled_output) special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) cos_dist = cosine_distance(image_embeds, self.concept_embeds) adjustment = 0.0 special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment special_care = torch.any(special_scores > 0, dim=1) special_adjustment = special_care * 0.01 special_adjustment = special_adjustment.unsqueeze(1).expand( -1, cos_dist.shape[1]) concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment has_nsfw_concepts = torch.any(concept_scores > 0, dim=1) images[has_nsfw_concepts] = 0.0 return images, has_nsfw_concepts, concept_scores setattr(pipe.safety_checker, "forward", MethodType(forward_extended_return, pipe.safety_checker)) # Trace the safety_checker model logger.info("JIT tracing..") traced_safety_checker = torch.jit.trace( baseline_safety_checker, list(sample_safety_checker_inputs.values())) logger.info("Done.") del baseline_safety_checker gc.collect() # Cast all inputs to float16 coreml_sample_safety_checker_inputs = { k: v.numpy().astype(np.float16) for k, v in sample_safety_checker_inputs.items() } # Convert safety_checker model to Core ML coreml_safety_checker, out_path = _convert_to_coreml( "safety_checker", traced_safety_checker, coreml_sample_safety_checker_inputs, ["filtered_images", "has_nsfw_concepts", 
"concept_scores"], args) # Set model metadata coreml_safety_checker.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}" coreml_safety_checker.license = "OpenRAIL (https://huggingface.co/spaces/CompVis/stable-diffusion-license)" coreml_safety_checker.version = args.model_version coreml_safety_checker.short_description = \ "Stable Diffusion generates images conditioned on text and/or other images as input through the diffusion process. " \ "Please refer to https://arxiv.org/abs/2112.10752 for details." # Set the input descriptions coreml_safety_checker.input_description["clip_input"] = \ "The normalized image input tensor resized to (224x224) in channels-first (BCHW) format" coreml_safety_checker.input_description["images"] = \ f"Output of the vae_decoder ({pipe.vae.config.sample_size}x{pipe.vae.config.sample_size}) in channels-last (BHWC) format" coreml_safety_checker.input_description["adjustment"] = \ "Bias added to the concept scores to trade off increased recall for reduce precision in the safety checker classifier" # Set the output descriptions coreml_safety_checker.output_description["filtered_images"] = \ f"Identical to the input `images`. If safety checker detected any sensitive content, " \ "the corresponding image is replaced with a blank image (zeros)" coreml_safety_checker.output_description["has_nsfw_concepts"] = \ "Indicates whether the safety checker model found any sensitive content in the given image" coreml_safety_checker.output_description["concept_scores"] = \ "Concept scores are the scores before thresholding at zero yields the `has_nsfw_concepts` output. 
def _prepare_controlnet_sample_inputs(pipe, args):
    """ Builds the random sample inputs shared by all ControlNet conversions

    Also calls `pipe.scheduler.set_timesteps` so that the first scheduled
    timestep can be used as the sample `timestep` input.

    Returns:
        A tuple `(sample_inputs, baseline_inputs)`. `sample_inputs` is an
        OrderedDict with the keys `sample`, `timestep`,
        `encoder_hidden_states` and `controlnet_cond`. `baseline_inputs` is a
        deep copy in which `encoder_hidden_states` has been squeezed on dim 2
        and had dims 1 and 2 transposed, for use with the original model.
    """
    batch_size = 2  # for classifier-free guidance
    latent_h = args.latent_h or pipe.unet.config.sample_size
    latent_w = args.latent_w or pipe.unet.config.sample_size

    sample_shape = (
        batch_size,                    # B
        pipe.unet.config.in_channels,  # C
        latent_h,                      # H
        latent_w,                      # W
    )

    encoder_hidden_states_shape = (
        batch_size,
        args.text_encoder_hidden_size or pipe.text_encoder.config.hidden_size,
        1,
        args.text_token_sequence_length or pipe.text_encoder.config.max_position_embeddings,
    )

    # The conditioning input is sized at 8x the latent resolution
    controlnet_cond_shape = (
        batch_size,    # B
        3,             # C
        latent_h * 8,  # H
        latent_w * 8,  # W
    )

    # Create the scheduled timesteps for downstream use
    DEFAULT_NUM_INFERENCE_STEPS = 50
    pipe.scheduler.set_timesteps(DEFAULT_NUM_INFERENCE_STEPS)

    # Prepare inputs
    sample_controlnet_inputs = OrderedDict([
        ("sample", torch.rand(*sample_shape)),
        ("timestep",
         torch.tensor([pipe.scheduler.timesteps[0].item()] *
                      (batch_size)).to(torch.float32)),
        ("encoder_hidden_states", torch.rand(*encoder_hidden_states_shape)),
        ("controlnet_cond", torch.rand(*controlnet_cond_shape)),
    ])
    sample_controlnet_inputs_spec = {
        k: (v.shape, v.dtype)
        for k, v in sample_controlnet_inputs.items()
    }
    logger.info(
        f"Sample ControlNet inputs spec: {sample_controlnet_inputs_spec}")

    baseline_sample_controlnet_inputs = deepcopy(sample_controlnet_inputs)
    baseline_sample_controlnet_inputs[
        "encoder_hidden_states"] = baseline_sample_controlnet_inputs[
            "encoder_hidden_states"].squeeze(2).transpose(1, 2)

    return sample_controlnet_inputs, baseline_sample_controlnet_inputs


def convert_controlnet(pipe, args):
    """ Converts each ControlNet for Stable Diffusion

    Iterates over `args.convert_controlnet`, and for every model version that
    does not already have an `.mlpackage` in `args.o`: loads the original
    model, traces the reference implementation, converts it through
    `_convert_to_coreml`, attaches metadata and saves it.

    Args:
        pipe: Pipeline object. `pipe.unet` and `pipe.text_encoder` must still
            be present because their configs determine the sample input
            shapes.
        args: Parsed command line arguments.

    Raises:
        RuntimeError: If `pipe.unet` or `pipe.text_encoder` has already been
            deleted, or if a ControlNet declares a base model that differs
            from `args.model_version`.
    """
    if not hasattr(pipe, "unet"):
        raise RuntimeError(
            "convert_unet() deletes pipe.unet to save RAM. "
            "Please use convert_controlnet() before convert_unet()")

    if not hasattr(pipe, "text_encoder"):
        raise RuntimeError(
            "convert_text_encoder() deletes pipe.text_encoder to save RAM. "
            "Please use convert_controlnet() before convert_text_encoder()")

    # The sample inputs are the same for every ControlNet. They are created
    # lazily, right before the first model that actually gets converted, so
    # that skipping an already-converted first entry does not leave them
    # undefined for the remaining entries.
    sample_controlnet_inputs = None
    baseline_sample_controlnet_inputs = None

    for controlnet_model_version in args.convert_controlnet:
        base_model = _get_controlnet_base_model(controlnet_model_version)

        if base_model is None and args.model_version != "runwayml/stable-diffusion-v1-5":
            logger.warning(
                f"The original ControlNet models were trained using Stable Diffusion v1.5. "
                f"It is possible that model {args.model_version} is not compatible with controlnet.")
        if base_model is not None and base_model != args.model_version:
            raise RuntimeError(
                f"ControlNet model {controlnet_model_version} was trained using "
                f"Stable Diffusion model {base_model}.\n However, you specified "
                f"version {args.model_version} in the command line. Please, use "
                f"--model-version {base_model} to convert this model.")

        controlnet_model_name = controlnet_model_version.replace("/", "_")
        fname = f"ControlNet_{controlnet_model_name}.mlpackage"
        out_path = os.path.join(args.o, fname)

        if os.path.exists(out_path):
            logger.info(
                f"`controlnet_{controlnet_model_name}` already exists at {out_path}, skipping conversion."
            )
            continue

        if sample_controlnet_inputs is None:
            sample_controlnet_inputs, baseline_sample_controlnet_inputs = \
                _prepare_controlnet_sample_inputs(pipe, args)

        # Import controlnet model and initialize reference controlnet
        original_controlnet = ControlNetModel.from_pretrained(
            controlnet_model_version,
            use_auth_token=True
        )
        reference_controlnet = controlnet.ControlNetModel(**original_controlnet.config).eval()
        reference_controlnet.load_state_dict(original_controlnet.state_dict())

        num_residuals = reference_controlnet.get_num_residuals()
        output_keys = [
            f"additional_residual_{residual_idx}"
            for residual_idx in range(num_residuals)
        ]

        # JIT trace
        logger.info("JIT tracing..")
        reference_controlnet = torch.jit.trace(reference_controlnet, list(sample_controlnet_inputs.values()))
        logger.info("Done.")

        if args.check_output_correctness:
            baseline_out = original_controlnet(**baseline_sample_controlnet_inputs, return_dict=False)
            reference_out = reference_controlnet(*sample_controlnet_inputs.values())
            report_correctness(
                baseline_out[-1].numpy(),
                reference_out[-1].numpy(),
                f"{controlnet_model_name} baseline to reference PyTorch")

        del original_controlnet
        gc.collect()

        coreml_sample_controlnet_inputs = {
            k: v.numpy().astype(np.float16)
            for k, v in sample_controlnet_inputs.items()
        }

        coreml_controlnet, out_path = _convert_to_coreml(
            f"controlnet_{controlnet_model_name}",
            reference_controlnet,
            coreml_sample_controlnet_inputs,
            output_keys,
            args,
            out_path=out_path)

        del reference_controlnet
        gc.collect()

        coreml_controlnet.author = f"Please refer to the Model Card available at huggingface.co/{controlnet_model_version}"
        coreml_controlnet.license = "OpenRAIL (https://huggingface.co/spaces/CompVis/stable-diffusion-license)"
        coreml_controlnet.version = controlnet_model_version
        coreml_controlnet.short_description = \
            "ControlNet is a neural network structure to control diffusion models by adding extra conditions. " \
            "Please refer to https://arxiv.org/abs/2302.05543 for details."

        # Set the input descriptions
        coreml_controlnet.input_description["sample"] = \
            "The low resolution latent feature maps being denoised through reverse diffusion"
        coreml_controlnet.input_description["timestep"] = \
            "A value emitted by the associated scheduler object to condition the model on a given noise schedule"
        coreml_controlnet.input_description["encoder_hidden_states"] = \
            "Output embeddings from the associated text_encoder model to condition to generated image on text. " \
            "A maximum of 77 tokens (~40 words) are allowed. Longer text is truncated. " \
            "Shorter text does not reduce computation."
        coreml_controlnet.input_description["controlnet_cond"] = \
            "An additional input image for ControlNet to condition the generated images."

        # Set the output descriptions
        for output_key in output_keys:
            coreml_controlnet.output_description[output_key] = \
                "One of the outputs of each downsampling block in ControlNet. " \
                "The value added to the corresponding resnet output in UNet."

        coreml_controlnet.save(out_path)
        logger.info(f"Saved controlnet into {out_path}")

        # Parity check PyTorch vs CoreML (reuses `baseline_out` computed above
        # under the same flag)
        if args.check_output_correctness:
            coreml_out = coreml_controlnet.predict(coreml_sample_controlnet_inputs)
            report_correctness(
                baseline_out[-1].numpy(),
                coreml_out[output_keys[-1]],
                "controlnet baseline PyTorch to reference CoreML"
            )

        del coreml_controlnet
        gc.collect()
args.xl_version = True logger.info(f"SD3 version specified, initializing DiffusionPipeline with {sdxl_base_version} for non-SD3 components..") pipe = DiffusionPipeline.from_pretrained(sdxl_base_version, torch_dtype=torch.float16, variant="fp16", use_safetensors=True, use_auth_token=True) else: pipe = DiffusionPipeline.from_pretrained(model_version, torch_dtype=torch.float16, variant="fp16", use_safetensors=True, use_auth_token=True) logger.info(f"Done. Pipeline in effect: {pipe.__class__.__name__}") return pipe def main(args): os.makedirs(args.o, exist_ok=True) # Instantiate diffusers pipe as reference pipe = get_pipeline(args) # Register the selected attention implementation globally unet.ATTENTION_IMPLEMENTATION_IN_EFFECT = unet.AttentionImplementations[ args.attention_implementation] logger.info( f"Attention implementation in effect: {unet.ATTENTION_IMPLEMENTATION_IN_EFFECT}" ) # Convert models if args.convert_vae_decoder: logger.info("Converting vae_decoder") if args.sd3_version: convert_vae_decoder_sd3(args) else: convert_vae_decoder(pipe, args) logger.info("Converted vae_decoder") if args.convert_vae_encoder: logger.info("Converting vae_encoder") convert_vae_encoder(pipe, args) logger.info("Converted vae_encoder") if args.convert_controlnet: logger.info("Converting controlnet") convert_controlnet(pipe, args) logger.info("Converted controlnet") if args.convert_unet: logger.info("Converting unet") convert_unet(pipe, args) logger.info("Converted unet") if args.convert_text_encoder and hasattr(pipe, "text_encoder") and pipe.text_encoder is not None: logger.info("Converting text_encoder") convert_text_encoder(pipe.text_encoder, pipe.tokenizer, "text_encoder", args) del pipe.text_encoder logger.info("Converted text_encoder") if args.convert_text_encoder and hasattr(pipe, "text_encoder_2") and pipe.text_encoder_2 is not None: logger.info("Converting text_encoder_2") convert_text_encoder(pipe.text_encoder_2, pipe.tokenizer_2, "text_encoder_2", args) del 
def parser_spec():
    """ Builds the command line argument parser for the conversion script

    Returns:
        An `argparse.ArgumentParser` with all conversion, model selection,
        shape override, quantization and Swift CLI bundling options
        registered. `--model-version` is the only required argument.
    """
    parser = argparse.ArgumentParser()

    # Select which models to export (All are needed for text-to-image pipeline to function)
    parser.add_argument("--convert-text-encoder", action="store_true")
    parser.add_argument("--convert-vae-decoder", action="store_true")
    parser.add_argument("--convert-vae-encoder", action="store_true")
    parser.add_argument("--convert-unet", action="store_true")
    parser.add_argument("--convert-mmdit", action="store_true")
    parser.add_argument("--convert-safety-checker", action="store_true")
    parser.add_argument(
        "--convert-controlnet",
        nargs="*",
        type=str,
        help=
        "Converts a ControlNet model hosted on HuggingFace to coreML format. " \
        "To convert multiple models, provide their names separated by spaces.",
    )
    parser.add_argument(
        "--model-version",
        required=True,
        help=
        ("The pre-trained model checkpoint and configuration to restore. "
         "For available versions: https://huggingface.co/models?search=stable-diffusion"
         ))
    parser.add_argument(
        "--refiner-version",
        default=None,
        help=
        ("The pre-trained refiner model checkpoint and configuration to restore. "
         "If specified, this argument will convert and bundle the refiner unet only alongside the model unet. "
         # Trailing space added so the two sentences do not run together
         "If you would like to convert a refiner model on it's own, use the --model-version argument instead. "
         "For available versions: https://huggingface.co/models?sort=trending&search=stable-diffusion+refiner"
         ))
    parser.add_argument(
        "--custom-vae-version",
        type=str,
        default=None,
        help=
        ("Custom VAE checkpoint to override the pipeline's built-in VAE. "
         "If specified, the specified VAE will be converted instead of the one associated to the `--model-version` checkpoint. "
         "No precision override is applied when using a custom VAE."
         ))
    parser.add_argument("--compute-unit",
                        choices=tuple(ct.ComputeUnit._member_names_),
                        default="ALL")

    parser.add_argument(
        "--latent-h",
        type=int,
        default=None,
        help=
        "The spatial resolution (number of rows) of the latent space. `Defaults to pipe.unet.config.sample_size`",
    )
    parser.add_argument(
        "--latent-w",
        type=int,
        default=None,
        help=
        "The spatial resolution (number of cols) of the latent space. `Defaults to pipe.unet.config.sample_size`",
    )
    parser.add_argument(
        "--text-token-sequence-length",
        type=int,
        default=None,
        help=
        "The token sequence length for the text encoder. `Defaults to pipe.text_encoder.config.max_position_embeddings`",
    )
    parser.add_argument(
        "--text-encoder-hidden-size",
        type=int,
        default=None,
        help=
        "The hidden size for the text encoder. `Defaults to pipe.text_encoder.config.hidden_size`",
    )
    parser.add_argument(
        "--attention-implementation",
        choices=tuple(unet.AttentionImplementations._member_names_),
        default=unet.ATTENTION_IMPLEMENTATION_IN_EFFECT.name,
        help=
        "The enumerated implementations trade off between ANE and GPU performance",
    )
    parser.add_argument(
        "-o",
        default=os.getcwd(),
        help="The resulting mlpackages will be saved into this directory")
    parser.add_argument(
        "--check-output-correctness",
        action="store_true",
        help=
        "If specified, compares the outputs of original PyTorch and final CoreML models and reports PSNR in dB. "
        "Enabling this feature uses more memory. Disable it if your machine runs out of memory."
    )
    parser.add_argument(
        "--chunk-unet",
        action="store_true",
        help=
        "If specified, generates two mlpackages out of the unet model which approximately equal weights sizes. "
        "This is required for ANE deployment on iOS and iPadOS. Not required for macOS."
    )
    parser.add_argument(
        "--quantize-nbits",
        default=None,
        choices=(1, 2, 4, 6, 8),
        type=int,
        help="If specified, quantized each model to nbits precision"
    )
    parser.add_argument(
        "--unet-support-controlnet",
        action="store_true",
        help=
        "If specified, enable unet to receive additional inputs from controlnet. "
        "Each input added to corresponding resnet output."
    )
    parser.add_argument(
        "--unet-batch-one",
        action="store_true",
        help=
        "If specified, a batch size of one will be used for the unet, this is needed if you do not want to do "
        "classifier free guidance. Default unet batch size is two, which is needed for classifier free guidance."
    )
    parser.add_argument("--include-t5", action="store_true")

    # Swift CLI Resource Bundling
    parser.add_argument(
        "--bundle-resources-for-swift-cli",
        action="store_true",
        help=
        "If specified, creates a resources directory compatible with the sample Swift CLI. "
        "It compiles all four models and adds them to a StableDiffusionResources directory "
        "along with a `vocab.json` and `merges.txt` for the text tokenizer")
    parser.add_argument(
        "--text-encoder-vocabulary-url",
        default=
        "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/vocab.json",
        help="The URL to the vocabulary file use by the text tokenizer")
    parser.add_argument(
        "--text-encoder-merges-url",
        default=
        "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/merges.txt",
        help="The URL to the merged pairs used in by the text tokenizer.")
    parser.add_argument(
        "--text-encoder-t5-url",
        default=
        "https://huggingface.co/argmaxinc/coreml-stable-diffusion-3-medium/resolve/main/TextEncoderT5.mlmodelc",
        help="The URL to the pre-converted T5 encoder model.")
    # The two help strings below previously repeated the merges.txt
    # description; they now describe the files their default URLs point to.
    parser.add_argument(
        "--text-encoder-t5-config-url",
        default=
        "https://huggingface.co/google-t5/t5-small/resolve/main/tokenizer_config.json",
        help="The URL to the tokenizer configuration file (tokenizer_config.json) for the T5 text tokenizer.")
    parser.add_argument(
        "--text-encoder-t5-data-url",
        default=
        "https://huggingface.co/google-t5/t5-small/resolve/main/tokenizer.json",
        help="The URL to the tokenizer data file (tokenizer.json) for the T5 text tokenizer.")
    parser.add_argument(
        "--min-deployment-target",
        default="macOS13",
        help=(
            "Minimum deployment target for Core ML models. "
            "Valid options include macOS13, macOS14, macOS15, iOS16, iOS17, iOS18. "
            "For iOS 18 compatibility with advanced quantization features, use iOS18. "
            "Default is macOS13 for backwards compatibility."
        )
    )
    parser.add_argument(
        "--xl-version",
        action="store_true",
        help=("If specified, the pre-trained model will be treated as an instantiation of "
              "`diffusers.pipelines.StableDiffusionXLPipeline` instead of `diffusers.pipelines.StableDiffusionPipeline`"))
    parser.add_argument(
        "--sd3-version",
        action="store_true",
        help=("If specified, the pre-trained model will be treated as an SD3 model."))
    return parser
class Einsum(nn.Module):
    """ Dispatches the attention computation to the implementation selected by
    the module-level `ATTENTION_IMPLEMENTATION_IN_EFFECT` setting.

    The setting is read on every `forward` call, so changing it after
    construction takes effect for subsequent calls.
    """

    def __init__(self, heads, dim_head):
        super().__init__()
        self.heads = heads
        self.dim_head = dim_head

    def forward(self, q, k, v, mask):
        """ Runs attention on `q`, `k`, `v` and `mask` using the selected implementation

        Raises:
            ValueError: If `ATTENTION_IMPLEMENTATION_IN_EFFECT` is not one of
                the `AttentionImplementations` members handled here.
        """
        if ATTENTION_IMPLEMENTATION_IN_EFFECT == AttentionImplementations.ORIGINAL:
            return attention.original(q, k, v, mask, self.heads, self.dim_head)
        elif ATTENTION_IMPLEMENTATION_IN_EFFECT == AttentionImplementations.SPLIT_EINSUM:
            return attention.split_einsum(q, k, v, mask, self.heads, self.dim_head)
        elif ATTENTION_IMPLEMENTATION_IN_EFFECT == AttentionImplementations.SPLIT_EINSUM_V2:
            return attention.split_einsum_v2(q, k, v, mask, self.heads, self.dim_head)

        # Previously this fell through and returned None, which only surfaced
        # later as an unrelated failure in the caller.
        raise ValueError(
            f"Unsupported attention implementation: {ATTENTION_IMPLEMENTATION_IN_EFFECT!r}"
        )
def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
                         missing_keys, unexpected_keys, error_msgs):
    """ Unsqueeze twice to map nn.Linear weights to nn.Conv2d weights

    Every entry of `state_dict` whose key contains 'weight' and whose value
    has exactly two dimensions is replaced, in place, by the same value with
    two trailing singleton dimensions appended. All other entries and all
    other arguments are left untouched; nothing is returned.
    """
    # Collect the matching keys first, then rewrite the entries
    linear_weight_keys = [
        name for name, value in state_dict.items()
        if 'weight' in name and len(value.shape) == 2
    ]
    for name in linear_weight_keys:
        state_dict[name] = state_dict[name][:, :, None, None]
class CrossAttnUpBlock2D(nn.Module):
    """ Up block made of `num_layers` (ResnetBlock2D, SpatialTransformer) pairs,
    optionally followed by a single Upsample2D.

    Each resnet consumes the running hidden states concatenated on dim 1 with
    one skip tensor taken from the end of `res_hidden_states_tuple`.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        prev_output_channel,
        temb_channels,
        num_layers=1,
        resnet_eps=1e-6,
        resnet_time_scale_shift="default",
        resnet_act_fn="swish",
        resnet_groups=32,
        attn_num_head_channels=1,
        cross_attention_dim=768,
        attention_type="default",
        output_scale_factor=1.0,
        downsample_padding=1,
        add_upsample=True,
        transformer_layers_per_block=1,
    ):
        super().__init__()

        self.attention_type = attention_type
        self.attn_num_head_channels = attn_num_head_channels

        resnet_layers, attention_layers = [], []
        last_layer = num_layers - 1
        for layer_idx in range(num_layers):
            # The last layer receives a skip connection of width `in_channels`;
            # the first layer receives the previous block's output as its main input.
            skip_width = in_channels if layer_idx == last_layer else out_channels
            main_width = prev_output_channel if layer_idx == 0 else out_channels

            resnet_layers.append(
                ResnetBlock2D(
                    in_channels=main_width + skip_width,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    time_embedding_norm=resnet_time_scale_shift,
                ))
            attention_layers.append(
                SpatialTransformer(
                    out_channels,
                    attn_num_head_channels,
                    out_channels // attn_num_head_channels,
                    depth=transformer_layers_per_block,
                    context_dim=cross_attention_dim,
                ))

        self.attentions = nn.ModuleList(attention_layers)
        self.resnets = nn.ModuleList(resnet_layers)
        self.upsamplers = (nn.ModuleList([Upsample2D(out_channels)])
                           if add_upsample else None)

    def forward(self,
                hidden_states,
                res_hidden_states_tuple,
                temb=None,
                encoder_hidden_states=None):
        """ Applies every (resnet, attention) pair, then the upsamplers if any

        Skip tensors are consumed from the end of `res_hidden_states_tuple`,
        one per pair.
        """
        layer_pairs = zip(self.resnets, self.attentions)
        for offset, (resnet, attn) in enumerate(layer_pairs, start=1):
            skip = res_hidden_states_tuple[-offset]
            merged = torch.cat([hidden_states, skip], dim=1)
            hidden_states = attn(resnet(merged, temb),
                                 context=encoder_hidden_states)

        if self.upsamplers is None:
            return hidden_states

        for upsampler in self.upsamplers:
            hidden_states = upsampler(hidden_states)
        return hidden_states
out_channels, temb_channels, num_layers=1, resnet_eps=1e-6, resnet_time_scale_shift="default", resnet_act_fn="swish", resnet_groups=32, add_upsample=True, ): super().__init__() resnets = [] for i in range(num_layers): res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( ResnetBlock2D( in_channels=resnet_in_channels + res_skip_channels, out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, time_embedding_norm=resnet_time_scale_shift, )) self.resnets = nn.ModuleList(resnets) self.upsamplers = None if add_upsample: self.upsamplers = nn.ModuleList([Upsample2D(out_channels)]) def forward(self, hidden_states, res_hidden_states_tuple, temb=None): for resnet in self.resnets: res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) hidden_states = resnet(hidden_states, temb) if self.upsamplers is not None: for upsampler in self.upsamplers: hidden_states = upsampler(hidden_states) return hidden_states class CrossAttnDownBlock2D(nn.Module): def __init__( self, in_channels, out_channels, temb_channels, transformer_layers_per_block=1, num_layers=1, resnet_eps=1e-6, resnet_time_scale_shift="default", resnet_act_fn="swish", resnet_groups=32, attn_num_head_channels=1, cross_attention_dim=768, attention_type="default", output_scale_factor=1.0, downsample_padding=1, add_downsample=True, ): super().__init__() resnets = [] attentions = [] self.attention_type = attention_type self.attn_num_head_channels = attn_num_head_channels for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels resnets.append( ResnetBlock2D( in_channels=in_channels, out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, time_embedding_norm=resnet_time_scale_shift, )) attentions.append( 
class ResnetBlock2D(nn.Module):
    """ Residual block: (GroupNorm -> SiLU -> 3x3 conv) twice, with an optional
    projected time embedding added between the two stages and a skip connection
    from the input (through a 1x1 conv when a shortcut projection is enabled).

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output; defaults to `in_channels`.
        conv_shortcut: stored as `use_conv_shortcut`; not otherwise consulted here.
        temb_channels: channels of the time embedding, or None to build the
            block without a time-embedding projection.
        groups / groups_out: GroupNorm group counts for the first / second norm
            (`groups_out` defaults to `groups`).
        eps: GroupNorm epsilon.
        time_embedding_norm: stored on the module; not otherwise consulted here.
        use_nin_shortcut: force (True) or suppress (False) the 1x1 shortcut conv;
            None enables it exactly when input and output channels differ.
    """

    def __init__(
        self,
        *,
        in_channels,
        out_channels=None,
        conv_shortcut=False,
        temb_channels=512,
        groups=32,
        groups_out=None,
        eps=1e-6,
        time_embedding_norm="default",
        use_nin_shortcut=None,
    ):
        super().__init__()
        # Resolve the default once so that every layer below receives an integer;
        # previously the raw `None` reached the Conv2d/GroupNorm constructors.
        if out_channels is None:
            out_channels = in_channels

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut
        self.time_embedding_norm = time_embedding_norm

        if groups_out is None:
            groups_out = groups

        self.norm1 = torch.nn.GroupNorm(num_groups=groups,
                                        num_channels=in_channels,
                                        eps=eps,
                                        affine=True)

        self.conv1 = torch.nn.Conv2d(in_channels,
                                     out_channels,
                                     kernel_size=3,
                                     stride=1,
                                     padding=1)

        if temb_channels is not None:
            # 1x1 conv acting as the linear projection of the time embedding.
            self.time_emb_proj = torch.nn.Conv2d(temb_channels,
                                                 out_channels,
                                                 kernel_size=1)
        else:
            self.time_emb_proj = None

        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out,
                                        num_channels=out_channels,
                                        eps=eps,
                                        affine=True)
        self.conv2 = torch.nn.Conv2d(out_channels,
                                     out_channels,
                                     kernel_size=3,
                                     stride=1,
                                     padding=1)

        self.nonlinearity = nn.SiLU()

        self.use_nin_shortcut = (self.in_channels != self.out_channels
                                 if use_nin_shortcut is None else
                                 use_nin_shortcut)

        self.conv_shortcut = None
        if self.use_nin_shortcut:
            self.conv_shortcut = torch.nn.Conv2d(in_channels,
                                                 out_channels,
                                                 kernel_size=1,
                                                 stride=1,
                                                 padding=0)

    def forward(self, x, temb):
        """ Apply the block to `x`; `temb` may be None to skip the time embedding. """
        hidden_states = x
        hidden_states = self.norm1(hidden_states)
        hidden_states = self.nonlinearity(hidden_states)
        hidden_states = self.conv1(hidden_states)

        if temb is not None:
            temb = self.time_emb_proj(self.nonlinearity(temb))
            hidden_states = hidden_states + temb

        hidden_states = self.norm2(hidden_states)
        hidden_states = self.nonlinearity(hidden_states)
        hidden_states = self.conv2(hidden_states)

        if self.conv_shortcut is not None:
            x = self.conv_shortcut(x)

        out = (x + hidden_states)

        return out
class SpatialTransformer(nn.Module):
    """ GroupNorm -> 1x1 conv in -> `depth` BasicTransformerBlock modules ->
    1x1 conv out, with a residual connection around the whole stack.

    Args:
        in_channels: channels of the input (and output) feature map.
        n_heads: number of attention heads.
        d_head: channels per head; the internal width is `n_heads * d_head`.
        depth: number of transformer blocks.
        context_dim: passed to each block as `context_dim`.
    """

    def __init__(
        self,
        in_channels,
        n_heads,
        d_head,
        depth=1,
        context_dim=None,
    ):
        super().__init__()
        self.n_heads = n_heads
        self.d_head = d_head
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = torch.nn.GroupNorm(num_groups=32,
                                       num_channels=in_channels,
                                       eps=1e-6,
                                       affine=True)

        self.proj_in = nn.Conv2d(in_channels,
                                 inner_dim,
                                 kernel_size=1,
                                 stride=1,
                                 padding=0)

        self.transformer_blocks = nn.ModuleList([
            BasicTransformerBlock(inner_dim,
                                  n_heads,
                                  d_head,
                                  context_dim=context_dim)
            for _ in range(depth)
        ])

        self.proj_out = nn.Conv2d(inner_dim,
                                  in_channels,
                                  kernel_size=1,
                                  stride=1,
                                  padding=0)

    def forward(self, hidden_states, context=None):
        """ Run the transformer stack on `hidden_states`, a 4-D tensor unpacked
        as (batch, channel, height, width), and add the input back as a residual.
        """
        batch, _, height, width = hidden_states.shape
        # After `proj_in` the channel axis has `inner_dim` entries, which is not
        # necessarily the input channel count, so reshape with that width.
        inner_dim = self.proj_in.out_channels

        residual = hidden_states
        hidden_states = self.norm(hidden_states)
        hidden_states = self.proj_in(hidden_states)
        # Flatten the spatial grid into the last axis: (B, C, 1, H*W).
        hidden_states = hidden_states.view(batch, inner_dim, 1, height * width)
        for block in self.transformer_blocks:
            hidden_states = block(hidden_states, context=context)
        hidden_states = hidden_states.view(batch, inner_dim, height, width)
        hidden_states = self.proj_out(hidden_states)
        return hidden_states + residual
class FeedForward(nn.Module):
    """ Gated feed-forward: GEGLU expansion to `dim * mult` channels followed by a
    1x1 convolution down to `dim_out` (or back to `dim` when `dim_out` is None).
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False):
        super().__init__()
        hidden_dim = int(dim * mult)
        output_dim = dim if dim_out is None else dim_out

        # `glu` is accepted but not consulted: the gated projection is always
        # used. The Identity keeps the final conv at position 2 of the Sequential.
        layers = (
            GEGLU(dim_in=dim, dim_out=hidden_dim),
            nn.Identity(),
            nn.Conv2d(hidden_dim, output_dim, kernel_size=1),
        )
        self.net = nn.Sequential(*layers)

    def forward(self, hidden_states):
        return self.net(hidden_states)


class GEGLU(nn.Module):
    """ 1x1 conv producing `2 * dim_out` channels that are split along dim 1 into
    a value half and a gate half; returns value * gelu(gate).
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.proj = nn.Conv2d(dim_in, dim_out * 2, kernel_size=1)

    def forward(self, hidden_states):
        projected = self.proj(hidden_states)
        value, gate = projected.chunk(2, dim=1)
        return value * F.gelu(gate)


def get_activation(act_fn):
    """ Return a new activation module for `act_fn`
    ("swish"/"silu", "mish" or "gelu"); raise ValueError for anything else.
    """
    if act_fn in ["swish", "silu"]:
        return nn.SiLU()
    if act_fn == "mish":
        return nn.Mish()
    if act_fn == "gelu":
        return nn.GELU()
    raise ValueError(f"Unsupported activation function: {act_fn}")
class Timesteps(nn.Module):
    """ Parameter-free module wrapping `get_timestep_embedding` with a fixed
    channel count, sin/cos ordering and frequency shift.
    """

    def __init__(self, num_channels, flip_sin_to_cos, downscale_freq_shift):
        super().__init__()
        self.num_channels = num_channels
        self.flip_sin_to_cos = flip_sin_to_cos
        self.downscale_freq_shift = downscale_freq_shift

    def forward(self, timesteps):
        return get_timestep_embedding(
            timesteps,
            self.num_channels,
            flip_sin_to_cos=self.flip_sin_to_cos,
            downscale_freq_shift=self.downscale_freq_shift,
        )


def get_timestep_embedding(
    timesteps,
    embedding_dim,
    flip_sin_to_cos=False,
    downscale_freq_shift=1,
    scale=1,
    max_period=10000,
):
    """ Sinusoidal embedding of a 1-D tensor of timesteps.

    Returns a (len(timesteps), embedding_dim) tensor laid out as [sin | cos],
    or [cos | sin] when `flip_sin_to_cos` is set; an odd `embedding_dim` gets
    one trailing zero column.
    """
    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"

    half_dim = embedding_dim // 2
    positions = torch.arange(start=0, end=half_dim, dtype=torch.float32)
    log_freqs = -math.log(max_period) * positions
    log_freqs = log_freqs / (half_dim - downscale_freq_shift)
    freqs = torch.exp(log_freqs).to(device=timesteps.device)

    # Outer product timestep x frequency, then the optional global scale.
    angles = scale * (timesteps[:, None].float() * freqs[None, :])

    sin_half, cos_half = torch.sin(angles), torch.cos(angles)
    halves = [cos_half, sin_half] if flip_sin_to_cos else [sin_half, cos_half]
    emb = torch.cat(halves, dim=-1)

    if embedding_dim % 2 == 1:
        # Zero-pad the last dimension by one column on the right.
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
class UNet2DConditionModel(ModelMixin, ConfigMixin):
    """ Conditional 2-D UNet built from the block classes of this module.

    The constructor assembles `conv_in`, the timestep embedding, the down, mid
    and up blocks and the output head; `forward` runs them in that order and
    returns a 1-tuple holding the predicted sample. Options this port does not
    implement raise NotImplementedError at construction time.
    """

    @register_to_config
    def __init__(
        self,
        sample_size=None,
        in_channels=4,
        out_channels=4,
        center_input_sample=False,
        flip_sin_to_cos=True,
        freq_shift=0,
        down_block_types=(
            "CrossAttnDownBlock2D",
            "CrossAttnDownBlock2D",
            "CrossAttnDownBlock2D",
            "DownBlock2D",
        ),
        mid_block_type="UNetMidBlock2DCrossAttn",
        up_block_types=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
        only_cross_attention=False,
        block_out_channels=(320, 640, 1280, 1280),
        layers_per_block=2,
        downsample_padding=1,
        mid_block_scale_factor=1,
        act_fn="silu",
        norm_num_groups=32,
        norm_eps=1e-5,
        cross_attention_dim=768,
        transformer_layers_per_block=1,
        attention_head_dim=8,
        addition_embed_type=None,
        addition_time_embed_dim=None,
        projection_class_embeddings_input_dim=None,
        support_controlnet=False,
        **kwargs,
    ):
        # Refuse configurations whose features are not implemented here.
        if kwargs.get("dual_cross_attention", None):
            raise NotImplementedError
        # The upstream config key is `num_class_embeds`. The misspelled key that
        # used to be the only one checked is still honoured for compatibility.
        if kwargs.get("num_class_embeds", None) or kwargs.get("num_classs_embeds", None):
            raise NotImplementedError
        if only_cross_attention:
            raise NotImplementedError
        if kwargs.get("use_linear_projection", None):
            logger.warning("`use_linear_projection=True` is ignored!")

        super().__init__()
        # Reshape 2-D (linear) weights to conv2d weights when loading checkpoints.
        self._register_load_state_dict_pre_hook(linear_to_conv2d_map)

        self.config.time_cond_proj_dim = None
        self.support_controlnet = support_controlnet
        self.sample_size = sample_size
        time_embed_dim = block_out_channels[0] * 4

        # input
        self.conv_in = nn.Conv2d(in_channels,
                                 block_out_channels[0],
                                 kernel_size=3,
                                 padding=(1, 1))

        # time
        time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
        timestep_input_dim = block_out_channels[0]
        time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)

        self.time_proj = time_proj
        self.time_embedding = time_embedding
        self.encoder_hid_proj = None

        # Only the "text_time" additional embedding is implemented.
        if addition_embed_type == "text":
            raise NotImplementedError
        elif addition_embed_type == "text_image":
            raise NotImplementedError
        elif addition_embed_type == "text_time":
            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
        elif addition_embed_type == "image":
            raise NotImplementedError
        elif addition_embed_type == "image_hint":
            raise NotImplementedError
        elif addition_embed_type is not None:
            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")

        self.down_blocks = nn.ModuleList([])
        self.mid_block = None
        self.up_blocks = nn.ModuleList([])

        # Broadcast scalar settings to one value per down block.
        if isinstance(only_cross_attention, bool):
            only_cross_attention = [only_cross_attention] * len(down_block_types)

        if isinstance(attention_head_dim, int):
            attention_head_dim = (attention_head_dim,) * len(down_block_types)

        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)

        # down
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1

            down_block = get_down_block(
                down_block_type,
                transformer_layers_per_block=transformer_layers_per_block[i],
                num_layers=layers_per_block,
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=time_embed_dim,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                cross_attention_dim=cross_attention_dim,
                attn_num_head_channels=attention_head_dim[i],
                downsample_padding=downsample_padding,
                add_downsample=not is_final_block,
            )
            self.down_blocks.append(down_block)

        # mid
        assert mid_block_type == "UNetMidBlock2DCrossAttn"
        self.mid_block = UNetMidBlock2DCrossAttn(
            in_channels=block_out_channels[-1],
            transformer_layers_per_block=transformer_layers_per_block[-1],
            temb_channels=time_embed_dim,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_time_scale_shift="default",
            cross_attention_dim=cross_attention_dim,
            # Use the last entry explicitly instead of the index variable left
            # over from the down-block loop above.
            attn_num_head_channels=attention_head_dim[-1],
            resnet_groups=norm_num_groups,
        )

        # up
        reversed_block_out_channels = list(reversed(block_out_channels))
        reversed_attention_head_dim = list(reversed(attention_head_dim))
        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
            input_channel = reversed_block_out_channels[min(
                i + 1,
                len(block_out_channels) - 1)]

            is_final_block = i == len(block_out_channels) - 1

            up_block = get_up_block(
                up_block_type,
                # One more layer than the down path.
                num_layers=layers_per_block + 1,
                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                prev_output_channel=prev_output_channel,
                temb_channels=time_embed_dim,
                add_upsample=not is_final_block,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                cross_attention_dim=cross_attention_dim,
                attn_num_head_channels=reversed_attention_head_dim[i],
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel

        # out
        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0],
                                          num_groups=norm_num_groups,
                                          eps=norm_eps)
        self.conv_act = nn.SiLU()
        self.conv_out = nn.Conv2d(block_out_channels[0],
                                  out_channels,
                                  3,
                                  padding=1)

    def forward(
        self,
        sample,
        timestep,
        encoder_hidden_states,
        *additional_residuals,
    ):
        """ Run the UNet.

        Args:
            sample: input to `conv_in`.
            timestep: timesteps fed to `time_proj`.
            encoder_hidden_states: conditioning passed to every block that has
                an `attentions` attribute and to the mid block.
            *additional_residuals: only read when `support_controlnet` is set;
                entry `i` is added to the i-th down-path residual and the last
                entry is added to the mid-block output.

        Returns:
            A 1-tuple containing the output of `conv_out`.
        """
        # 0. Project (or look-up) time embeddings
        t_emb = self.time_proj(timestep)
        emb = self.time_embedding(t_emb)

        # 1. center input if necessary
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # 2. pre-process
        sample = self.conv_in(sample)

        # 3. down
        down_block_res_samples = (sample, )
        for downsample_block in self.down_blocks:
            if hasattr(
                    downsample_block,
                    "attentions") and downsample_block.attentions is not None:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states)
            else:
                sample, res_samples = downsample_block(hidden_states=sample,
                                                       temb=emb)

            down_block_res_samples += res_samples

        if self.support_controlnet:
            new_down_block_res_samples = ()
            for i, down_block_res_sample in enumerate(down_block_res_samples):
                down_block_res_sample = down_block_res_sample + additional_residuals[i]
                new_down_block_res_samples += (down_block_res_sample,)
            down_block_res_samples = new_down_block_res_samples

        # 4. mid
        sample = self.mid_block(sample,
                                emb,
                                encoder_hidden_states=encoder_hidden_states)

        if self.support_controlnet:
            sample = sample + additional_residuals[-1]

        # 5. up
        for upsample_block in self.up_blocks:
            # Each up block consumes as many residuals as it has resnets,
            # taken from the end of the stack.
            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
            down_block_res_samples = down_block_res_samples[:-len(
                upsample_block.resnets)]

            if hasattr(upsample_block,
                       "attentions") and upsample_block.attentions is not None:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                )
            else:
                sample = upsample_block(hidden_states=sample,
                                        temb=emb,
                                        res_hidden_states_tuple=res_samples)

        # 6. post-process
        sample = self.conv_norm_out(sample)
        sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        return (sample, )
down down_block_res_samples = (sample, ) for downsample_block in self.down_blocks: if hasattr( downsample_block, "attentions") and downsample_block.attentions is not None: sample, res_samples = downsample_block( hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states) else: sample, res_samples = downsample_block(hidden_states=sample, temb=emb) down_block_res_samples += res_samples if self.support_controlnet: new_down_block_res_samples = () for i, down_block_res_sample in enumerate(down_block_res_samples): down_block_res_sample = down_block_res_sample + additional_residuals[i] new_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = new_down_block_res_samples # 4. mid sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states) if self.support_controlnet: sample = sample + additional_residuals[-1] # 5. up for upsample_block in self.up_blocks: res_samples = down_block_res_samples[-len(upsample_block.resnets):] down_block_res_samples = down_block_res_samples[:-len( upsample_block.resnets)] if hasattr(upsample_block, "attentions") and upsample_block.attentions is not None: sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, encoder_hidden_states=encoder_hidden_states, ) else: sample = upsample_block(hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples) # 6. 
def get_down_block(
    down_block_type,
    num_layers,
    in_channels,
    out_channels,
    temb_channels,
    resnet_eps,
    resnet_act_fn,
    attn_num_head_channels,
    transformer_layers_per_block=1,
    cross_attention_dim=None,
    downsample_padding=None,
    add_downsample=True,
):
    """ Build the down block named by `down_block_type`.

    A leading "UNetRes" in the name is ignored. Supported names are
    "DownBlock2D" and "CrossAttnDownBlock2D".

    Raises:
        ValueError: if `cross_attention_dim` is None for "CrossAttnDownBlock2D",
            or if the block type is not one of the supported names (previously
            an unknown name silently returned None).
    """
    down_block_type = down_block_type[7:] if down_block_type.startswith(
        "UNetRes") else down_block_type
    if down_block_type == "DownBlock2D":
        return DownBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            add_downsample=add_downsample,
        )
    elif down_block_type == "CrossAttnDownBlock2D":
        if cross_attention_dim is None:
            raise ValueError(
                "cross_attention_dim must be specified for CrossAttnDownBlock2D"
            )
        return CrossAttnDownBlock2D(
            num_layers=num_layers,
            transformer_layers_per_block=transformer_layers_per_block,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            downsample_padding=downsample_padding,
            cross_attention_dim=cross_attention_dim,
            attn_num_head_channels=attn_num_head_channels,
            add_downsample=add_downsample,
        )
    # Same failure mode as get_up_block for unrecognised names.
    raise ValueError(f"{down_block_type} does not exist.")
"cross_attention_dim must be specified for CrossAttnUpBlock2D") return CrossAttnUpBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, prev_output_channel=prev_output_channel, temb_channels=temb_channels, add_upsample=add_upsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, cross_attention_dim=cross_attention_dim, attn_num_head_channels=attn_num_head_channels, transformer_layers_per_block=transformer_layers_per_block, ) raise ValueError(f"{up_block_type} does not exist.") def calculate_conv2d_output_shape(in_h, in_w, conv2d_layer): k_h, k_w = conv2d_layer.kernel_size pad_h, pad_w = conv2d_layer.padding stride_h, stride_w = conv2d_layer.stride out_h = math.floor((in_h + 2 * pad_h - k_h) / stride_h + 1) out_w = math.floor((in_w + 2 * pad_w - k_w) / stride_w + 1) return out_h, out_w ================================================ FILE: requirements.txt ================================================ coremltools>=8.0 diffusers[torch]==0.30.2 diffusionkit==0.4.0 torch transformers==4.44.2 scipy scikit-learn pytest invisible-watermark safetensors matplotlib ================================================ FILE: setup.py ================================================ from setuptools import setup, find_packages from python_coreml_stable_diffusion._version import __version__ with open('README.md') as f: readme = f.read() setup( name='python_coreml_stable_diffusion', version=__version__, url='https://github.com/apple/ml-stable-diffusion', description="Run Stable Diffusion on Apple Silicon with Core ML (Python and Swift)", long_description=readme, long_description_content_type='text/markdown', author='Apple Inc.', install_requires=[ "coremltools>=8.0", "diffusers[torch]==0.30.2", "torch", "transformers==4.44.2", "huggingface-hub==0.24.6", "scipy", "numpy<1.24", "pytest", "scikit-learn", "invisible-watermark", "safetensors", "matplotlib", "diffusionkit==0.4.0", ], packages=find_packages(), classifiers=[ "Development Status :: 4 - 
Beta", "Intended Audience :: Developers", "Operating System :: MacOS :: MacOS X", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Topic :: Artificial Intelligence", "Topic :: Scientific/Engineering", "Topic :: Software Development", ], ) ================================================ FILE: swift/StableDiffusion/pipeline/CGImage+vImage.swift ================================================ // For licensing see accompanying LICENSE.md file. // Copyright (C) 2022 Apple Inc. All Rights Reserved. import Foundation import Accelerate import CoreML import CoreGraphics @available(iOS 16.0, macOS 13.0, *) extension CGImage { typealias PixelBufferPFx1 = vImage.PixelBuffer typealias PixelBufferP8x3 = vImage.PixelBuffer typealias PixelBufferIFx3 = vImage.PixelBuffer typealias PixelBufferI8x3 = vImage.PixelBuffer public enum ShapedArrayError: String, Swift.Error { case wrongNumberOfChannels case incorrectFormatsConvertingToShapedArray case vImageConverterNotInitialized } public static func fromShapedArray(_ array: MLShapedArray) throws -> CGImage { // array is [N,C,H,W], where C==3 let channelCount = array.shape[1] guard channelCount == 3 else { throw ShapedArrayError.wrongNumberOfChannels } let height = array.shape[2] let width = array.shape[3] // Normalize each channel into a float between 0 and 1.0 let floatChannels = (0.. 
[0.0 1.0] cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut) } return cOut } // Convert to interleaved and then to UInt8 let floatImage = PixelBufferIFx3(planarBuffers: floatChannels) let uint8Image = PixelBufferI8x3(width: width, height: height) floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips // Convert to uint8x3 to RGB CGImage (no alpha) let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue) let cgImage = uint8Image.makeCGImage(cgImageFormat: .init(bitsPerComponent: 8, bitsPerPixel: 3*8, colorSpace: CGColorSpace(name: CGColorSpace.sRGB) ?? CGColorSpaceCreateDeviceRGB(), bitmapInfo: bitmapInfo)!)! return cgImage } public func planarRGBShapedArray(minValue: Float, maxValue: Float) throws -> MLShapedArray { guard var sourceFormat = vImage_CGImageFormat(cgImage: self), var mediumFormat = vImage_CGImageFormat( bitsPerComponent: 8 * MemoryLayout.size, bitsPerPixel: 8 * MemoryLayout.size * 4, colorSpace: CGColorSpaceCreateDeviceRGB(), bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.first.rawValue)), let width = vImagePixelCount(exactly: self.width), let height = vImagePixelCount(exactly: self.height) else { throw ShapedArrayError.incorrectFormatsConvertingToShapedArray } var sourceImageBuffer = try vImage_Buffer(cgImage: self) var mediumDestination = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: mediumFormat.bitsPerPixel) let converter = vImageConverter_CreateWithCGImageFormat( &sourceFormat, &mediumFormat, nil, vImage_Flags(kvImagePrintDiagnosticsToConsole), nil) guard let converter = converter?.takeRetainedValue() else { throw ShapedArrayError.vImageConverterNotInitialized } vImageConvert_AnyToAny(converter, &sourceImageBuffer, &mediumDestination, nil, vImage_Flags(kvImagePrintDiagnosticsToConsole)) var destinationA = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout.size)) var destinationR = try vImage_Buffer(width: Int(width), 
/// Maps an 8-bit channel value from `0...255` onto `-1.0...1.0`
/// (0 becomes -1.0 and 255 becomes 1.0).
private func normalizePixelValues(pixel: UInt8) -> Float {
    let halfRange: Float = 127.5
    let scaled = Float(pixel) / halfRange
    return scaled - 1.0
}
MLShapedArray(scalars: blueChannel, shape: colorShape) let shapedArray = MLShapedArray(concatenating: [redShapedArray, greenShapedArray, blueShapedArray], alongAxis: 1) return shapedArray } } extension vImage_Buffer { func unpaddedData() -> Data { let bytesPerPixel = self.rowBytes / Int(self.width) let bytesPerRow = Int(self.width) * bytesPerPixel var contiguousPixelData = Data(capacity: bytesPerRow * Int(self.height)) for row in 0..