[
  {
    "path": ".gitignore",
    "content": "wandb/\n*debug*\ndebugs/\noutputs/\nsamples/\n__pycache__/\nossutil_output/\n.ossutil_checkpoint/\n\nscripts/*\n!scripts/animate.py\n\n*.ipynb\n*.safetensors\n*.ckpt\n\nmodels/*\n!models/StableDiffusion/\nmodels/StableDiffusion/*\n!models/StableDiffusion/*.txt\n!models/Motion_Module/\n!models/Motion_Module/*.txt\n!models/DreamBooth_LoRA/\n!models/DreamBooth_LoRA/*.txt\n!models/MotionLoRA/\n!models/MotionLoRA/*.txt\n"
  },
  {
    "path": "LICENSE.txt",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "# AnimateDiff\n\nThis repository is the official implementation of [AnimateDiff](https://arxiv.org/abs/2307.04725) [ICLR2024 Spotlight].\nIt is a plug-and-play module turning most community text-to-image models into animation generators, without the need of additional training.\n\n**[AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning](https://arxiv.org/abs/2307.04725)** \n</br>\n[Yuwei Guo](https://guoyww.github.io/),\n[Ceyuan Yang✝](https://ceyuan.me/),\n[Anyi Rao](https://anyirao.com/),\n[Zhengyang Liang](https://maxleung99.github.io/),\n[Yaohui Wang](https://wyhsirius.github.io/),\n[Yu Qiao](https://scholar.google.com.hk/citations?user=gFtI-8QAAAAJ),\n[Maneesh Agrawala](https://graphics.stanford.edu/~maneesh/),\n[Dahua Lin](http://dahua.site),\n[Bo Dai](https://daibo.info)\n(✝Corresponding Author)  \n[![arXiv](https://img.shields.io/badge/arXiv-2307.04725-b31b1b.svg)](https://arxiv.org/abs/2307.04725)\n[![Project Page](https://img.shields.io/badge/Project-Website-green)](https://animatediff.github.io/)\n[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Masbfca/AnimateDiff)\n[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/guoyww/AnimateDiff)\n\n***Note:*** The `main` branch is for [Stable Diffusion V1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5); for [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), please refer `sdxl-beta` branch.\n\n\n## Quick Demos\nMore results can be found in the [Gallery](__assets__/docs/gallery.md).\nSome of them are contributed by the community.\n\n<table class=\"center\">\n    <tr>\n    <td><img src=\"__assets__/animations/model_01/01.gif\"></td>\n    <td><img src=\"__assets__/animations/model_01/02.gif\"></td>\n    <td><img src=\"__assets__/animations/model_01/03.gif\"></td>\n   
 <td><img src=\"__assets__/animations/model_01/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">Model：<a href=\"https://civitai.com/models/30240/toonyou\">ToonYou</a></p>\n\n<table>\n    <tr>\n    <td><img src=\"__assets__/animations/model_03/01.gif\"></td>\n    <td><img src=\"__assets__/animations/model_03/02.gif\"></td>\n    <td><img src=\"__assets__/animations/model_03/03.gif\"></td>\n    <td><img src=\"__assets__/animations/model_03/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">Model：<a href=\"https://civitai.com/models/4201/realistic-vision-v20\">Realistic Vision V2.0</a></p>\n\n\n## Quick Start\n***Note:*** AnimateDiff is also officially supported by Diffusers.\nVisit [AnimateDiff Diffusers Tutorial](https://huggingface.co/docs/diffusers/api/pipelines/animatediff) for more details.\n*The following instructions are for working with this repository*.\n\n***Note:*** For all scripts, checkpoint downloading will be *automatically* handled, so the script running may take a longer time when first executed.\n\n### 1. Setup repository and environment\n\n```\ngit clone https://github.com/guoyww/AnimateDiff.git\ncd AnimateDiff\n\npip install -r requirements.txt\n```\n\n### 2. 
Launch the sampling script!\nThe generated samples can be found in the `samples/` folder.\n\n#### 2.1 Generate animations with community models\n```\npython -m scripts.animate --config configs/prompts/1_animate/1_1_animate_RealisticVision.yaml\npython -m scripts.animate --config configs/prompts/1_animate/1_2_animate_FilmVelvia.yaml\npython -m scripts.animate --config configs/prompts/1_animate/1_3_animate_ToonYou.yaml\npython -m scripts.animate --config configs/prompts/1_animate/1_4_animate_MajicMix.yaml\npython -m scripts.animate --config configs/prompts/1_animate/1_5_animate_RcnzCartoon.yaml\npython -m scripts.animate --config configs/prompts/1_animate/1_6_animate_Lyriel.yaml\npython -m scripts.animate --config configs/prompts/1_animate/1_7_animate_Tusun.yaml\n```\n\n#### 2.2 Generate animation with MotionLoRA control\n```\npython -m scripts.animate --config configs/prompts/2_motionlora/2_motionlora_RealisticVision.yaml\n```\n\n#### 2.3 More control with SparseCtrl RGB and sketch\n```\npython -m scripts.animate --config configs/prompts/3_sparsectrl/3_1_sparsectrl_i2v.yaml\npython -m scripts.animate --config configs/prompts/3_sparsectrl/3_2_sparsectrl_rgb_RealisticVision.yaml\npython -m scripts.animate --config configs/prompts/3_sparsectrl/3_3_sparsectrl_sketch_RealisticVision.yaml\n```\n\n#### 2.4 Gradio app\nWe created a Gradio demo to make AnimateDiff easier to use. \nBy default, the demo will run at `localhost:7860`.\n```\npython -u app.py\n```\n<img src=\"__assets__/figs/gradio.jpg\" style=\"width: 75%\">\n\n\n## Technical Explanation\n<details close>\n<summary>Technical Explanation</summary>\n\n### AnimateDiff\n\n**AnimateDiff aims to learn transferable motion priors that can be applied to other variants of the Stable Diffusion family.**\nTo this end, we design the following training pipeline consisting of three stages.\n\n<img src=\"__assets__/figs/adapter_explain.png\" style=\"width:100%\">\n\n- In **1. 
Alleviate Negative Effects** stage, we train the **domain adapter**, e.g., `v3_sd15_adapter.ckpt`, to fit defective visual artifacts (e.g., watermarks) in the training dataset.\nThis can also benefit the disentangled learning of motion and spatial appearance.\nBy default, the adapter can be removed at inference. It can also be integrated into the model and its effects can be adjusted by a lora scaler.\n\n- In **2. Learn Motion Priors** stage, we train the **motion module**, e.g., `v3_sd15_mm.ckpt`, to learn the real-world motion patterns from videos.\n\n- In **3. (optional) Adapt to New Patterns** stage, we train **MotionLoRA**, e.g., `v2_lora_ZoomIn.ckpt`, to efficiently adapt motion module for specific motion patterns (camera zooming, rolling, etc.).\n\n### SparseCtrl\n\n**SparseCtrl aims to add more control to text-to-video models by adopting some sparse inputs (e.g., few RGB images or sketch inputs).**\nIts technical details can be found in the following paper:\n\n**[SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933)**  \n[Yuwei Guo](https://guoyww.github.io/),\n[Ceyuan Yang✝](https://ceyuan.me/),\n[Anyi Rao](https://anyirao.com/),\n[Maneesh Agrawala](https://graphics.stanford.edu/~maneesh/),\n[Dahua Lin](http://dahua.site),\n[Bo Dai](https://daibo.info)\n(✝Corresponding Author)  \n[![arXiv](https://img.shields.io/badge/arXiv-2311.16933-b31b1b.svg)](https://arxiv.org/abs/2311.16933)\n[![Project Page](https://img.shields.io/badge/Project-Website-green)](https://guoyww.github.io/projects/SparseCtrl/)\n\n</details>\n\n\n## Model Versions\n<details close>\n<summary>Model Versions</summary>\n\n### AnimateDiff v3 and SparseCtrl (2023.12)\n\nIn this version, we use **Domain Adapter LoRA** for image model finetuning, which provides more flexibility at inference.\nWe also implement two (RGB image/scribble) [SparseCtrl](https://arxiv.org/abs/2311.16933) encoders, which can take an arbitrary number of condition maps to control 
the animation contents.\n\n<details close>\n<summary>AnimateDiff v3 Model Zoo</summary>\n\n| Name | HuggingFace | Type | Storage | Description |\n| - | - | - | - | - |\n| `v3_sd15_adapter.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v3_sd15_adapter.ckpt) | Domain Adapter | 97.4 MB | |\n| `v3_sd15_mm.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v3_sd15_mm.ckpt) | Motion Module | 1.56 GB | |\n| `v3_sd15_sparsectrl_scribble.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v3_sd15_sparsectrl_scribble.ckpt) | SparseCtrl Encoder | 1.86 GB | scribble condition |\n| `v3_sd15_sparsectrl_rgb.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v3_sd15_sparsectrl_rgb.ckpt) | SparseCtrl Encoder | 1.85 GB | RGB image condition |\n</details>\n\n#### Limitations\n1. Small flickering is noticeable;\n2. To stay compatible with community models, there are no specific optimizations for general T2V, leading to limited visual quality under this setting;\n3. 
**(Style Alignment) For usage such as image animation/interpolation, it's recommanded to use images generated by the same community model.**\n\n#### Demos\n<table class=\"center\">\n    <tr style=\"line-height: 0\">\n    <td width=25% style=\"border: none; text-align: center\">Input (by RealisticVision)</td>\n    <td width=25% style=\"border: none; text-align: center\">Animation</td>\n    <td width=25% style=\"border: none; text-align: center\">Input</td>\n    <td width=25% style=\"border: none; text-align: center\">Animation</td>\n    </tr>\n    <tr>\n    <td width=25% style=\"border: none\"><img src=\"__assets__/demos/image/RealisticVision_firework.png\" style=\"width:100%\"></td>\n    <td width=25% style=\"border: none\"><img src=\"__assets__/animations/v3/animation_fireworks.gif\" style=\"width:100%\"></td>\n    <td width=25% style=\"border: none\"><img src=\"__assets__/demos/image/RealisticVision_sunset.png\" style=\"width:100%\"></td>\n    <td width=25% style=\"border: none\"><img src=\"__assets__/animations/v3/animation_sunset.gif\" style=\"width:100%\"></td>\n    </tr>\n</table>\n\n<table class=\"center\">\n    <tr style=\"line-height: 0\">\n    <td width=25% style=\"border: none; text-align: center\">Input Scribble</td>\n    <td width=25% style=\"border: none; text-align: center\">Output</td>\n    <td width=25% style=\"border: none; text-align: center\">Input Scribbles</td>\n    <td width=25% style=\"border: none; text-align: center\">Output</td>\n    </tr>\n    <tr>\n      <td width=25% style=\"border: none\"><img src=\"__assets__/demos/scribble/scribble_1.png\" style=\"width:100%\"></td>\n      <td width=25% style=\"border: none\"><img src=\"__assets__/animations/v3/sketch_boy.gif\" style=\"width:100%\"></td>\n      <td width=25% style=\"border: none\"><img src=\"__assets__/demos/scribble/scribble_2_readme.png\" style=\"width:100%\"></td>\n      <td width=25% style=\"border: none\"><img src=\"__assets__/animations/v3/sketch_city.gif\" 
style=\"width:100%\"></td>\n    </tr>\n</table>\n\n\n### AnimateDiff SDXL-Beta (2023.11)\n\nRelease the Motion Module (beta version) on SDXL, available at [Google Drive](https://drive.google.com/file/d/1EK_D9hDOPfJdK4z8YDB8JYvPracNx2SX/view?usp=share_link\n) / [HuggingFace](https://huggingface.co/guoyww/animatediff/blob/main/mm_sdxl_v10_beta.ckpt\n) / [CivitAI](https://civitai.com/models/108836/animatediff-motion-modules). High resolution videos (i.e., 1024x1024x16 frames with various aspect ratios) could be produced **with/without** personalized models. Inference usually requires ~13GB VRAM and tuned hyperparameters (e.g., sampling steps), depending on the chosen personalized models.  \nCheckout to the branch [sdxl](https://github.com/guoyww/AnimateDiff/tree/sdxl) for more details of the inference.\n\n<details close>\n<summary>AnimateDiff SDXL-Beta Model Zoo</summary>\n\n| Name | HuggingFace | Type | Storage Space |\n| - | - | - | - |\n| `mm_sdxl_v10_beta.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/mm_sdxl_v10_beta.ckpt) | Motion Module | 950 MB |\n</details>\n\n#### Demos\n<table class=\"center\">\n    <tr style=\"line-height: 0\">\n    <td width=52% style=\"border: none; text-align: center\">Original SDXL</td>\n    <td width=30% style=\"border: none; text-align: center\">Community SDXL</td>\n    <td width=18% style=\"border: none; text-align: center\">Community SDXL</td>\n    </tr>\n    <tr>\n    <td width=52% style=\"border: none\"><img src=\"__assets__/animations/motion_xl/01.gif\" style=\"width:100%\"></td>\n    <td width=30% style=\"border: none\"><img src=\"__assets__/animations/motion_xl/02.gif\" style=\"width:100%\"></td>\n    <td width=18% style=\"border: none\"><img src=\"__assets__/animations/motion_xl/03.gif\" style=\"width:100%\"></td>\n    </tr>\n</table>\n\n\n### AnimateDiff v2 (2023.09)\n\nIn this version, the motion module `mm_sd_v15_v2.ckpt` ([Google 
Drive](https://drive.google.com/drive/folders/1EqLC65eR1-W-sGD0Im7fkED6c8GkiNFI?usp=sharing) / [HuggingFace](https://huggingface.co/guoyww/animatediff) / [CivitAI](https://civitai.com/models/108836/animatediff-motion-modules)) is trained upon larger resolution and batch size.\nWe found that the scale-up training significantly helps improve the motion quality and diversity.  \nWe also support **MotionLoRA** of eight basic camera movements.\nMotionLoRA checkpoints take up only **77 MB storage per model**, and are available at [Google Drive](https://drive.google.com/drive/folders/1EqLC65eR1-W-sGD0Im7fkED6c8GkiNFI?usp=sharing) / [HuggingFace](https://huggingface.co/guoyww/animatediff) / [CivitAI](https://civitai.com/models/108836/animatediff-motion-modules).\n\n<details close>\n<summary>AnimateDiff v2 Model Zoo</summary>\n\n| Name | HuggingFace | Type | Parameter | Storage |\n| - | - | - | - | - |\n| `mm_sd_v15_v2.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/mm_sd_v15_v2.ckpt) | Motion Module | 453 M | 1.7 GB |\n| `v2_lora_ZoomIn.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v2_lora_ZoomIn.ckpt) | MotionLoRA | 19 M | 74 MB |\n| `v2_lora_ZoomOut.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v2_lora_ZoomOut.ckpt) | MotionLoRA | 19 M | 74 MB |\n| `v2_lora_PanLeft.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v2_lora_PanLeft.ckpt) | MotionLoRA | 19 M | 74 MB |\n| `v2_lora_PanRight.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v2_lora_PanRight.ckpt) | MotionLoRA | 19 M | 74 MB |\n| `v2_lora_TiltUp.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v2_lora_TiltUp.ckpt) | MotionLoRA | 19 M | 74 MB |\n| `v2_lora_TiltDown.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v2_lora_TiltDown.ckpt) | MotionLoRA | 19 M | 74 MB |\n| `v2_lora_RollingClockwise.ckpt` | 
[Link](https://huggingface.co/guoyww/animatediff/blob/main/v2_lora_RollingClockwise.ckpt) | MotionLoRA | 19 M | 74 MB |\n| `v2_lora_RollingAnticlockwise.ckpt` | [Link](https://huggingface.co/guoyww/animatediff/blob/main/v2_lora_RollingAnticlockwise.ckpt) | MotionLoRA | 19 M | 74 MB |\n</details>\n\n\n#### Demos (MotionLoRA)\n<table class=\"center\">\n  <tr style=\"line-height: 0\">\n    <td colspan=\"2\" style=\"border: none; text-align: center\">Zoom In</td>\n    <td colspan=\"2\" style=\"border: none; text-align: center\">Zoom Out</td>\n    <td colspan=\"2\" style=\"border: none; text-align: center\">Zoom Pan Left</td>\n    <td colspan=\"2\" style=\"border: none; text-align: center\">Zoom Pan Right</td>\n  </tr>\n  <tr>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_01/01.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_02/02.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_01/02.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_02/01.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_01/03.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_02/04.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_01/04.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_02/03.gif\"></td>\n  </tr>\n  <tr style=\"line-height: 0\">\n    <td colspan=\"2\" style=\"border: none; text-align: center\">Tilt Up</td>\n    <td colspan=\"2\" style=\"border: none; text-align: center\">Tilt Down</td>\n    <td colspan=\"2\" style=\"border: none; text-align: center\">Rolling Anti-Clockwise</td>\n    <td colspan=\"2\" style=\"border: none; text-align: center\">Rolling Clockwise</td>\n  </tr>\n  <tr>\n    <td style=\"border: none\"><img 
src=\"__assets__/animations/motion_lora/model_01/05.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_02/05.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_01/06.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_02/06.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_01/07.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_02/07.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_01/08.gif\"></td>\n    <td style=\"border: none\"><img src=\"__assets__/animations/motion_lora/model_02/08.gif\"></td>\n  </tr>\n</table>\n\n\n#### Demos (Improved Motions)\nHere's a comparison between `mm_sd_v15.ckpt` (left) and improved `mm_sd_v15_v2.ckpt` (right).\n\n<table class=\"center\">\n  <tr>\n    <td><img src=\"__assets__/animations/compare/old_0.gif\"></td>\n    <td><img src=\"__assets__/animations/compare/new_0.gif\"></td>\n    <td><img src=\"__assets__/animations/compare/old_1.gif\"></td>\n    <td><img src=\"__assets__/animations/compare/new_1.gif\"></td>\n    <td><img src=\"__assets__/animations/compare/old_2.gif\"></td>\n    <td><img src=\"__assets__/animations/compare/new_2.gif\"></td>\n    <td><img src=\"__assets__/animations/compare/old_3.gif\"></td>\n    <td><img src=\"__assets__/animations/compare/new_3.gif\"></td>\n  </tr>\n</table>\n\n\n### AnimateDiff v1 (2023.07)\n\nThe first version of AnimateDiff!\n\n<details close>\n<summary>AnimateDiff v1 Model Zoo</summary>\n\n| Name | HuggingFace | Parameter | Storage Space |\n| - | - | - | - |\n| mm_sd_v14.ckpt | [Link](https://huggingface.co/guoyww/animatediff/blob/main/mm_sd_v14.ckpt) | 417 M | 1.6 GB |\n| mm_sd_v15.ckpt | [Link](https://huggingface.co/guoyww/animatediff/blob/main/mm_sd_v15.ckpt) | 417 M | 1.6 GB |\n</details>\n\n</details>\n\n\n## 
Training\nPlease check [Steps for Training](__assets__/docs/animatediff.md) for details.\n\n\n## Related Resources\n\nAnimateDiff for Stable Diffusion WebUI: [sd-webui-animatediff](https://github.com/continue-revolution/sd-webui-animatediff) (by [@continue-revolution](https://github.com/continue-revolution))  \nAnimateDiff for ComfyUI: [ComfyUI-AnimateDiff-Evolved](https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved) (by [@Kosinkadink](https://github.com/Kosinkadink))  \nGoogle Colab: [Colab](https://colab.research.google.com/github/camenduru/AnimateDiff-colab/blob/main/AnimateDiff_colab.ipynb) (by [@camenduru](https://github.com/camenduru))\n\n\n## Disclaimer\nThis project is released for academic use.\nWe disclaim responsibility for user-generated content.\nAlso, please be advised that our only official websites are https://github.com/guoyww/AnimateDiff and https://animatediff.github.io, and all the other websites are NOT associated with us at AnimateDiff. \n\n\n## Contact Us\nYuwei Guo: [guoyw@ie.cuhk.edu.hk](mailto:guoyw@ie.cuhk.edu.hk)  \nCeyuan Yang: [limbo0066@gmail.com](mailto:limbo0066@gmail.com)  \nBo Dai: [doubledaibo@gmail.com](mailto:doubledaibo@gmail.com)\n\n\n## BibTeX\n```\n@article{guo2023animatediff,\n  title={AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning},\n  author={Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Liang, Zhengyang and Wang, Yaohui and Qiao, Yu and Agrawala, Maneesh and Lin, Dahua and Dai, Bo},\n  journal={International Conference on Learning Representations},\n  year={2024}\n}\n\n@article{guo2023sparsectrl,\n  title={SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models},\n  author={Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Agrawala, Maneesh and Lin, Dahua and Dai, Bo},\n  journal={arXiv preprint arXiv:2311.16933},\n  year={2023}\n}\n```\n\n\n## Acknowledgements\nCodebase built upon [Tune-a-Video](https://github.com/showlab/Tune-A-Video).\n"
  },
  {
    "path": "__assets__/animations/compare/ffmpeg",
    "content": ""
  },
  {
    "path": "__assets__/docs/animatediff.md",
    "content": "## Steps for Training\n\n### Dataset\nBefore training, download the videos files and the `.csv` annotations of [WebVid10M](https://maxbain.com/webvid-dataset/) to the local mechine.\nNote that our examplar training script requires all the videos to be saved in a single folder. You may change this by modifying `animatediff/data/dataset.py`.\n\n### Configuration\nAfter dataset preparations, update the below data paths in the config `.yaml` files in `configs/training/` folder:\n```\ntrain_data:\n  csv_path: [Replace with .csv Annotation File Path]\n  video_folder: [Replace with Video Folder Path]\n  sample_size: 256\n```\nOther training parameters (lr, epochs, validation settings, etc.) are also included in the config files.\n\n### Training\nTo finetune the unet's image layers\n```\ntorchrun --nnodes=1 --nproc_per_node=1 train.py --config configs/training/v1/image_finetune.yaml\n```\n\nTo train motion modules\n```\ntorchrun --nnodes=1 --nproc_per_node=1 train.py --config configs/training/v1/training.yaml\n```\n"
  },
  {
    "path": "__assets__/docs/gallery.md",
    "content": "# Gallery\nHere we demonstrate several best results we found in our experiments.\n\n<table class=\"center\">\n    <tr>\n    <td><img src=\"../animations/model_01/01.gif\"></td>\n    <td><img src=\"../animations/model_01/02.gif\"></td>\n    <td><img src=\"../animations/model_01/03.gif\"></td>\n    <td><img src=\"../animations/model_01/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">Model：<a href=\"https://civitai.com/models/30240/toonyou\">ToonYou</a></p>\n\n<table>\n    <tr>\n    <td><img src=\"../animations/model_02/01.gif\"></td>\n    <td><img src=\"../animations/model_02/02.gif\"></td>\n    <td><img src=\"../animations/model_02/03.gif\"></td>\n    <td><img src=\"../animations/model_02/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">Model：<a href=\"https://civitai.com/models/4468/counterfeit-v30\">Counterfeit V3.0</a></p>\n\n<table>\n    <tr>\n    <td><img src=\"../animations/model_03/01.gif\"></td>\n    <td><img src=\"../animations/model_03/02.gif\"></td>\n    <td><img src=\"../animations/model_03/03.gif\"></td>\n    <td><img src=\"../animations/model_03/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">Model：<a href=\"https://civitai.com/models/4201/realistic-vision-v20\">Realistic Vision V2.0</a></p>\n\n<table>\n    <tr>\n    <td><img src=\"../animations/model_04/01.gif\"></td>\n    <td><img src=\"../animations/model_04/02.gif\"></td>\n    <td><img src=\"../animations/model_04/03.gif\"></td>\n    <td><img src=\"../animations/model_04/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">Model： <a href=\"https://civitai.com/models/43331/majicmix-realistic\">majicMIX Realistic</a></p>\n\n<table>\n    <tr>\n    <td><img src=\"../animations/model_05/01.gif\"></td>\n    <td><img src=\"../animations/model_05/02.gif\"></td>\n    <td><img src=\"../animations/model_05/03.gif\"></td>\n    <td><img 
src=\"../animations/model_05/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">Model：<a href=\"https://civitai.com/models/66347/rcnz-cartoon-3d\">RCNZ Cartoon</a></p>\n\n<table>\n    <tr>\n    <td><img src=\"../animations/model_06/01.gif\"></td>\n    <td><img src=\"../animations/model_06/02.gif\"></td>\n    <td><img src=\"../animations/model_06/03.gif\"></td>\n    <td><img src=\"../animations/model_06/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">Model：<a href=\"https://civitai.com/models/33208/filmgirl-film-grain-lora-and-loha\">FilmVelvia</a></p>\n\n#### Community Cases\nHere are some samples contributed by the community artists. Create a Pull Request if you would like to show your results here😚.\n\n<table>\n    <tr>\n    <td><img src=\"../animations/model_07/init.jpg\"></td>\n    <td><img src=\"../animations/model_07/01.gif\"></td>\n    <td><img src=\"../animations/model_07/02.gif\"></td>\n    <td><img src=\"../animations/model_07/03.gif\"></td>\n    <td><img src=\"../animations/model_07/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">\nCharacter Model：<a href=\"https://civitai.com/models/13237/genshen-impact-yoimiya\">Yoimiya</a> \n(with an initial reference image, see <a href=\"https://github.com/talesofai/AnimateDiff\">WIP fork</a> for the extended implementation.)</p>\n\n\n<table>\n    <tr>\n    <td><img src=\"../animations/model_08/01.gif\"></td>\n    <td><img src=\"../animations/model_08/02.gif\"></td>\n    <td><img src=\"../animations/model_08/03.gif\"></td>\n    <td><img src=\"../animations/model_08/04.gif\"></td>\n    </tr>\n</table>\n<p style=\"margin-left: 2em; margin-top: -1em\">\nCharacter Model：<a href=\"https://civitai.com/models/9850/paimon-genshin-impact\">Paimon</a>;\nPose Model：<a href=\"https://civitai.com/models/107295/or-holdingsign\">Hold Sign</a></p>\n\n\n"
  },
  {
    "path": "animatediff/data/dataset.py",
    "content": "import os, io, csv, math, random\nimport numpy as np\nfrom einops import rearrange\nfrom decord import VideoReader\n\nimport torch\nimport torchvision.transforms as transforms\nfrom torch.utils.data.dataset import Dataset\nfrom animatediff.utils.util import zero_rank_print\n\n\n\nclass WebVid10M(Dataset):\n    def __init__(\n            self,\n            csv_path, video_folder,\n            sample_size=256, sample_stride=4, sample_n_frames=16,\n            is_image=False,\n        ):\n        zero_rank_print(f\"loading annotations from {csv_path} ...\")\n        with open(csv_path, 'r') as csvfile:\n            self.dataset = list(csv.DictReader(csvfile))\n        self.length = len(self.dataset)\n        zero_rank_print(f\"data scale: {self.length}\")\n\n        self.video_folder    = video_folder\n        self.sample_stride   = sample_stride\n        self.sample_n_frames = sample_n_frames\n        self.is_image        = is_image\n        \n        sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)\n        self.pixel_transforms = transforms.Compose([\n            transforms.RandomHorizontalFlip(),\n            transforms.Resize(sample_size[0]),\n            transforms.CenterCrop(sample_size),\n            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),\n        ])\n    \n    def get_batch(self, idx):\n        video_dict = self.dataset[idx]\n        videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir']\n        \n        video_dir    = os.path.join(self.video_folder, f\"{videoid}.mp4\")\n        video_reader = VideoReader(video_dir)\n        video_length = len(video_reader)\n        \n        if not self.is_image:\n            clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1)\n            start_idx   = random.randint(0, video_length - clip_length)\n            batch_index = np.linspace(start_idx, 
start_idx + clip_length - 1, self.sample_n_frames, dtype=int)\n        else:\n            batch_index = [random.randint(0, video_length - 1)]\n\n        pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()).permute(0, 3, 1, 2).contiguous()\n        pixel_values = pixel_values / 255.\n        del video_reader\n\n        if self.is_image:\n            pixel_values = pixel_values[0]\n        \n        return pixel_values, name\n\n    def __len__(self):\n        return self.length\n\n    def __getitem__(self, idx):\n        while True:\n            try:\n                pixel_values, name = self.get_batch(idx)\n                break\n\n            except Exception as e:\n                idx = random.randint(0, self.length-1)\n\n        pixel_values = self.pixel_transforms(pixel_values)\n        sample = dict(pixel_values=pixel_values, text=name)\n        return sample\n\n\n\nif __name__ == \"__main__\":\n    from animatediff.utils.util import save_videos_grid\n\n    dataset = WebVid10M(\n        csv_path=\"/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv\",\n        video_folder=\"/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val\",\n        sample_size=256,\n        sample_stride=4, sample_n_frames=16,\n        is_image=True,\n    )\n    import pdb\n    pdb.set_trace()\n    \n    dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=16,)\n    for idx, batch in enumerate(dataloader):\n        print(batch[\"pixel_values\"].shape, len(batch[\"text\"]))\n        # for i in range(batch[\"pixel_values\"].shape[0]):\n        #     save_videos_grid(batch[\"pixel_values\"][i:i+1].permute(0,2,1,3,4), os.path.join(\".\", f\"{idx}-{i}.mp4\"), rescale=True)\n"
  },
  {
    "path": "animatediff/models/attention.py",
    "content": "# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py\n\nfrom dataclasses import dataclass\nfrom typing import Optional\n\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\n\nfrom diffusers.configuration_utils import ConfigMixin, register_to_config\nfrom diffusers.modeling_utils import ModelMixin\nfrom diffusers.utils import BaseOutput\nfrom diffusers.utils.import_utils import is_xformers_available\nfrom diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm\n\nfrom einops import rearrange, repeat\nimport pdb\n\n@dataclass\nclass Transformer3DModelOutput(BaseOutput):\n    sample: torch.FloatTensor\n\n\nif is_xformers_available():\n    import xformers\n    import xformers.ops\nelse:\n    xformers = None\n\n\nclass Transformer3DModel(ModelMixin, ConfigMixin):\n    @register_to_config\n    def __init__(\n        self,\n        num_attention_heads: int = 16,\n        attention_head_dim: int = 88,\n        in_channels: Optional[int] = None,\n        num_layers: int = 1,\n        dropout: float = 0.0,\n        norm_num_groups: int = 32,\n        cross_attention_dim: Optional[int] = None,\n        attention_bias: bool = False,\n        activation_fn: str = \"geglu\",\n        num_embeds_ada_norm: Optional[int] = None,\n        use_linear_projection: bool = False,\n        only_cross_attention: bool = False,\n        upcast_attention: bool = False,\n\n        unet_use_cross_frame_attention=None,\n        unet_use_temporal_attention=None,\n    ):\n        super().__init__()\n        self.use_linear_projection = use_linear_projection\n        self.num_attention_heads = num_attention_heads\n        self.attention_head_dim = attention_head_dim\n        inner_dim = num_attention_heads * attention_head_dim\n\n        # Define input layers\n        self.in_channels = in_channels\n\n        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, 
eps=1e-6, affine=True)\n        if use_linear_projection:\n            self.proj_in = nn.Linear(in_channels, inner_dim)\n        else:\n            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)\n\n        # Define transformers blocks\n        self.transformer_blocks = nn.ModuleList(\n            [\n                BasicTransformerBlock(\n                    inner_dim,\n                    num_attention_heads,\n                    attention_head_dim,\n                    dropout=dropout,\n                    cross_attention_dim=cross_attention_dim,\n                    activation_fn=activation_fn,\n                    num_embeds_ada_norm=num_embeds_ada_norm,\n                    attention_bias=attention_bias,\n                    only_cross_attention=only_cross_attention,\n                    upcast_attention=upcast_attention,\n\n                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,\n                    unet_use_temporal_attention=unet_use_temporal_attention,\n                )\n                for d in range(num_layers)\n            ]\n        )\n\n        # 4. 
Define output layers\n        if use_linear_projection:\n            self.proj_out = nn.Linear(in_channels, inner_dim)\n        else:\n            self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)\n\n    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):\n        # Input\n        assert hidden_states.dim() == 5, f\"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}.\"\n        video_length = hidden_states.shape[2]\n        hidden_states = rearrange(hidden_states, \"b c f h w -> (b f) c h w\")\n        encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b f) n c', f=video_length)\n\n        batch, channel, height, weight = hidden_states.shape\n        residual = hidden_states\n\n        hidden_states = self.norm(hidden_states)\n        if not self.use_linear_projection:\n            hidden_states = self.proj_in(hidden_states)\n            inner_dim = hidden_states.shape[1]\n            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)\n        else:\n            inner_dim = hidden_states.shape[1]\n            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)\n            hidden_states = self.proj_in(hidden_states)\n\n        # Blocks\n        for block in self.transformer_blocks:\n            hidden_states = block(\n                hidden_states,\n                encoder_hidden_states=encoder_hidden_states,\n                timestep=timestep,\n                video_length=video_length\n            )\n\n        # Output\n        if not self.use_linear_projection:\n            hidden_states = (\n                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()\n            )\n            hidden_states = self.proj_out(hidden_states)\n        else:\n            hidden_states = self.proj_out(hidden_states)\n            
hidden_states = (\n                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()\n            )\n\n        output = hidden_states + residual\n\n        output = rearrange(output, \"(b f) c h w -> b c f h w\", f=video_length)\n        if not return_dict:\n            return (output,)\n\n        return Transformer3DModelOutput(sample=output)\n\n\nclass BasicTransformerBlock(nn.Module):\n    def __init__(\n        self,\n        dim: int,\n        num_attention_heads: int,\n        attention_head_dim: int,\n        dropout=0.0,\n        cross_attention_dim: Optional[int] = None,\n        activation_fn: str = \"geglu\",\n        num_embeds_ada_norm: Optional[int] = None,\n        attention_bias: bool = False,\n        only_cross_attention: bool = False,\n        upcast_attention: bool = False,\n\n        unet_use_cross_frame_attention = None,\n        unet_use_temporal_attention = None,\n    ):\n        super().__init__()\n        self.only_cross_attention = only_cross_attention\n        self.use_ada_layer_norm = num_embeds_ada_norm is not None\n        self.unet_use_cross_frame_attention = unet_use_cross_frame_attention\n        self.unet_use_temporal_attention = unet_use_temporal_attention\n\n        # SC-Attn\n        assert unet_use_cross_frame_attention is not None\n        if unet_use_cross_frame_attention:\n            self.attn1 = SparseCausalAttention2D(\n                query_dim=dim,\n                heads=num_attention_heads,\n                dim_head=attention_head_dim,\n                dropout=dropout,\n                bias=attention_bias,\n                cross_attention_dim=cross_attention_dim if only_cross_attention else None,\n                upcast_attention=upcast_attention,\n            )\n        else:\n            self.attn1 = CrossAttention(\n                query_dim=dim,\n                heads=num_attention_heads,\n                dim_head=attention_head_dim,\n                dropout=dropout,\n        
        bias=attention_bias,\n                upcast_attention=upcast_attention,\n            )\n        self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)\n\n        # Cross-Attn\n        if cross_attention_dim is not None:\n            self.attn2 = CrossAttention(\n                query_dim=dim,\n                cross_attention_dim=cross_attention_dim,\n                heads=num_attention_heads,\n                dim_head=attention_head_dim,\n                dropout=dropout,\n                bias=attention_bias,\n                upcast_attention=upcast_attention,\n            )\n        else:\n            self.attn2 = None\n\n        if cross_attention_dim is not None:\n            self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)\n        else:\n            self.norm2 = None\n\n        # Feed-forward\n        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)\n        self.norm3 = nn.LayerNorm(dim)\n\n        # Temp-Attn\n        assert unet_use_temporal_attention is not None\n        if unet_use_temporal_attention:\n            self.attn_temp = CrossAttention(\n                query_dim=dim,\n                heads=num_attention_heads,\n                dim_head=attention_head_dim,\n                dropout=dropout,\n                bias=attention_bias,\n                upcast_attention=upcast_attention,\n            )\n            nn.init.zeros_(self.attn_temp.to_out[0].weight.data)\n            self.norm_temp = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)\n\n    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):\n        if not is_xformers_available():\n            print(\"Here is how to install it\")\n            raise ModuleNotFoundError(\n                \"Refer to https://github.com/facebookresearch/xformers for more information on how to 
install\"\n                \" xformers\",\n                name=\"xformers\",\n            )\n        elif not torch.cuda.is_available():\n            raise ValueError(\n                \"torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only\"\n                \" available for GPU \"\n            )\n        else:\n            try:\n                # Make sure we can run the memory efficient attention\n                _ = xformers.ops.memory_efficient_attention(\n                    torch.randn((1, 2, 40), device=\"cuda\"),\n                    torch.randn((1, 2, 40), device=\"cuda\"),\n                    torch.randn((1, 2, 40), device=\"cuda\"),\n                )\n            except Exception as e:\n                raise e\n            self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers\n            if self.attn2 is not None:\n                self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers\n            # self.attn_temp._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers\n\n    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None, video_length=None):\n        # SparseCausal-Attention\n        norm_hidden_states = (\n            self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)\n        )\n\n        # if self.only_cross_attention:\n        #     hidden_states = (\n        #         self.attn1(norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask) + hidden_states\n        #     )\n        # else:\n        #     hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states\n\n        # pdb.set_trace()\n        if self.unet_use_cross_frame_attention:\n            hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, 
video_length=video_length) + hidden_states\n        else:\n            hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask) + hidden_states\n\n        if self.attn2 is not None:\n            # Cross-Attention\n            norm_hidden_states = (\n                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)\n            )\n            hidden_states = (\n                self.attn2(\n                    norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask\n                )\n                + hidden_states\n            )\n\n        # Feed-forward\n        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states\n\n        # Temporal-Attention\n        if self.unet_use_temporal_attention:\n            d = hidden_states.shape[1]\n            hidden_states = rearrange(hidden_states, \"(b f) d c -> (b d) f c\", f=video_length)\n            norm_hidden_states = (\n                self.norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states)\n            )\n            hidden_states = self.attn_temp(norm_hidden_states) + hidden_states\n            hidden_states = rearrange(hidden_states, \"(b d) f c -> (b f) d c\", d=d)\n\n        return hidden_states\n"
  },
  {
    "path": "animatediff/models/motion_module.py",
    "content": "from dataclasses import dataclass\nfrom typing import List, Optional, Tuple, Union\n\nimport torch\nimport numpy as np\nimport torch.nn.functional as F\nfrom torch import nn\nimport torchvision\n\nfrom diffusers.configuration_utils import ConfigMixin, register_to_config\nfrom diffusers.modeling_utils import ModelMixin\nfrom diffusers.utils import BaseOutput\nfrom diffusers.utils.import_utils import is_xformers_available\nfrom diffusers.models.attention import CrossAttention, FeedForward\n\nfrom einops import rearrange, repeat\nimport math\n\n\ndef zero_module(module):\n    # Zero out the parameters of a module and return it.\n    for p in module.parameters():\n        p.detach().zero_()\n    return module\n\n\n@dataclass\nclass TemporalTransformer3DModelOutput(BaseOutput):\n    sample: torch.FloatTensor\n\n\nif is_xformers_available():\n    import xformers\n    import xformers.ops\nelse:\n    xformers = None\n\n\ndef get_motion_module(\n    in_channels,\n    motion_module_type: str, \n    motion_module_kwargs: dict\n):\n    if motion_module_type == \"Vanilla\":\n        return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs,)    \n    else:\n        raise ValueError\n\n\nclass VanillaTemporalModule(nn.Module):\n    def __init__(\n        self,\n        in_channels,\n        num_attention_heads                = 8,\n        num_transformer_block              = 2,\n        attention_block_types              =( \"Temporal_Self\", \"Temporal_Self\" ),\n        cross_frame_attention_mode         = None,\n        temporal_position_encoding         = False,\n        temporal_position_encoding_max_len = 24,\n        temporal_attention_dim_div         = 1,\n        zero_initialize                    = True,\n    ):\n        super().__init__()\n        \n        self.temporal_transformer = TemporalTransformer3DModel(\n            in_channels=in_channels,\n            num_attention_heads=num_attention_heads,\n            
attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,\n            num_layers=num_transformer_block,\n            attention_block_types=attention_block_types,\n            cross_frame_attention_mode=cross_frame_attention_mode,\n            temporal_position_encoding=temporal_position_encoding,\n            temporal_position_encoding_max_len=temporal_position_encoding_max_len,\n        )\n        \n        if zero_initialize:\n            self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)\n\n    def forward(self, input_tensor, temb, encoder_hidden_states, attention_mask=None, anchor_frame_idx=None):\n        hidden_states = input_tensor\n        hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)\n\n        output = hidden_states\n        return output\n\n\nclass TemporalTransformer3DModel(nn.Module):\n    def __init__(\n        self,\n        in_channels,\n        num_attention_heads,\n        attention_head_dim,\n\n        num_layers,\n        attention_block_types              = ( \"Temporal_Self\", \"Temporal_Self\", ),        \n        dropout                            = 0.0,\n        norm_num_groups                    = 32,\n        cross_attention_dim                = 768,\n        activation_fn                      = \"geglu\",\n        attention_bias                     = False,\n        upcast_attention                   = False,\n        \n        cross_frame_attention_mode         = None,\n        temporal_position_encoding         = False,\n        temporal_position_encoding_max_len = 24,\n    ):\n        super().__init__()\n\n        inner_dim = num_attention_heads * attention_head_dim\n\n        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)\n        self.proj_in = nn.Linear(in_channels, inner_dim)\n\n        self.transformer_blocks = nn.ModuleList(\n            [\n                
TemporalTransformerBlock(\n                    dim=inner_dim,\n                    num_attention_heads=num_attention_heads,\n                    attention_head_dim=attention_head_dim,\n                    attention_block_types=attention_block_types,\n                    dropout=dropout,\n                    norm_num_groups=norm_num_groups,\n                    cross_attention_dim=cross_attention_dim,\n                    activation_fn=activation_fn,\n                    attention_bias=attention_bias,\n                    upcast_attention=upcast_attention,\n                    cross_frame_attention_mode=cross_frame_attention_mode,\n                    temporal_position_encoding=temporal_position_encoding,\n                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,\n                )\n                for d in range(num_layers)\n            ]\n        )\n        self.proj_out = nn.Linear(inner_dim, in_channels)    \n    \n    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):\n        assert hidden_states.dim() == 5, f\"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}.\"\n        video_length = hidden_states.shape[2]\n        hidden_states = rearrange(hidden_states, \"b c f h w -> (b f) c h w\")\n\n        batch, channel, height, weight = hidden_states.shape\n        residual = hidden_states\n\n        hidden_states = self.norm(hidden_states)\n        inner_dim = hidden_states.shape[1]\n        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)\n        hidden_states = self.proj_in(hidden_states)\n\n        # Transformer Blocks\n        for block in self.transformer_blocks:\n            hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length)\n        \n        # output\n        hidden_states = self.proj_out(hidden_states)\n        hidden_states = hidden_states.reshape(batch, height, 
weight, inner_dim).permute(0, 3, 1, 2).contiguous()\n\n        output = hidden_states + residual\n        output = rearrange(output, \"(b f) c h w -> b c f h w\", f=video_length)\n        \n        return output\n\n\nclass TemporalTransformerBlock(nn.Module):\n    def __init__(\n        self,\n        dim,\n        num_attention_heads,\n        attention_head_dim,\n        attention_block_types              = ( \"Temporal_Self\", \"Temporal_Self\", ),\n        dropout                            = 0.0,\n        norm_num_groups                    = 32,\n        cross_attention_dim                = 768,\n        activation_fn                      = \"geglu\",\n        attention_bias                     = False,\n        upcast_attention                   = False,\n        cross_frame_attention_mode         = None,\n        temporal_position_encoding         = False,\n        temporal_position_encoding_max_len = 24,\n    ):\n        super().__init__()\n\n        attention_blocks = []\n        norms = []\n        \n        for block_name in attention_block_types:\n            attention_blocks.append(\n                VersatileAttention(\n                    attention_mode=block_name.split(\"_\")[0],\n                    cross_attention_dim=cross_attention_dim if block_name.endswith(\"_Cross\") else None,\n                    \n                    query_dim=dim,\n                    heads=num_attention_heads,\n                    dim_head=attention_head_dim,\n                    dropout=dropout,\n                    bias=attention_bias,\n                    upcast_attention=upcast_attention,\n        \n                    cross_frame_attention_mode=cross_frame_attention_mode,\n                    temporal_position_encoding=temporal_position_encoding,\n                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,\n                )\n            )\n            norms.append(nn.LayerNorm(dim))\n            \n        self.attention_blocks = 
nn.ModuleList(attention_blocks)\n        self.norms = nn.ModuleList(norms)\n\n        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)\n        self.ff_norm = nn.LayerNorm(dim)\n\n\n    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):\n        for attention_block, norm in zip(self.attention_blocks, self.norms):\n            norm_hidden_states = norm(hidden_states)\n            hidden_states = attention_block(\n                norm_hidden_states,\n                encoder_hidden_states=encoder_hidden_states if attention_block.is_cross_attention else None,\n                video_length=video_length,\n            ) + hidden_states\n            \n        hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states\n        \n        output = hidden_states  \n        return output\n\n\nclass PositionalEncoding(nn.Module):\n    def __init__(\n        self, \n        d_model, \n        dropout = 0., \n        max_len = 24\n    ):\n        super().__init__()\n        self.dropout = nn.Dropout(p=dropout)\n        position = torch.arange(max_len).unsqueeze(1)\n        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))\n        pe = torch.zeros(1, max_len, d_model)\n        pe[0, :, 0::2] = torch.sin(position * div_term)\n        pe[0, :, 1::2] = torch.cos(position * div_term)\n        self.register_buffer('pe', pe, persistent=False)\n\n    def forward(self, x):\n        x = x + self.pe[:, :x.size(1)]\n        return self.dropout(x)\n\n\nclass VersatileAttention(CrossAttention):\n    def __init__(\n            self,\n            attention_mode                     = None,\n            cross_frame_attention_mode         = None,\n            temporal_position_encoding         = False,\n            temporal_position_encoding_max_len = 32,            \n            *args, **kwargs\n        ):\n        super().__init__(*args, **kwargs)\n        assert attention_mode == 
\"Temporal\"\n\n        self.attention_mode = attention_mode\n        self.is_cross_attention = kwargs[\"cross_attention_dim\"] is not None\n        \n        self.pos_encoder = PositionalEncoding(\n            kwargs[\"query_dim\"],\n            dropout=0., \n            max_len=temporal_position_encoding_max_len\n        ) if (temporal_position_encoding and attention_mode == \"Temporal\") else None\n\n    def extra_repr(self):\n        return f\"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}\"\n\n    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):\n        batch_size, sequence_length, _ = hidden_states.shape\n\n        if self.attention_mode == \"Temporal\":\n            d = hidden_states.shape[1]\n            hidden_states = rearrange(hidden_states, \"(b f) d c -> (b d) f c\", f=video_length)\n            \n            if self.pos_encoder is not None:\n                hidden_states = self.pos_encoder(hidden_states)\n            \n            encoder_hidden_states = repeat(encoder_hidden_states, \"b n c -> (b d) n c\", d=d) if encoder_hidden_states is not None else encoder_hidden_states\n        else:\n            raise NotImplementedError\n\n        encoder_hidden_states = encoder_hidden_states\n\n        if self.group_norm is not None:\n            hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)\n\n        query = self.to_q(hidden_states)\n        dim = query.shape[-1]\n        query = self.reshape_heads_to_batch_dim(query)\n\n        if self.added_kv_proj_dim is not None:\n            raise NotImplementedError\n\n        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states\n        key = self.to_k(encoder_hidden_states)\n        value = self.to_v(encoder_hidden_states)\n\n        key = self.reshape_heads_to_batch_dim(key)\n        value = 
self.reshape_heads_to_batch_dim(value)\n\n        if attention_mask is not None:\n            if attention_mask.shape[-1] != query.shape[1]:\n                target_length = query.shape[1]\n                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)\n                attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)\n\n        # attention, what we cannot get enough of\n        if self._use_memory_efficient_attention_xformers:\n            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)\n            # Some versions of xformers return output in fp32, cast it back to the dtype of the input\n            hidden_states = hidden_states.to(query.dtype)\n        else:\n            if self._slice_size is None or query.shape[0] // self._slice_size == 1:\n                hidden_states = self._attention(query, key, value, attention_mask)\n            else:\n                hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)\n\n        # linear proj\n        hidden_states = self.to_out[0](hidden_states)\n\n        # dropout\n        hidden_states = self.to_out[1](hidden_states)\n\n        if self.attention_mode == \"Temporal\":\n            hidden_states = rearrange(hidden_states, \"(b d) f c -> (b f) d c\", d=d)\n\n        return hidden_states\n"
  },
  {
    "path": "animatediff/models/resnet.py",
    "content": "# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom einops import rearrange\n\n\nclass InflatedConv3d(nn.Conv2d):\n    def forward(self, x):\n        video_length = x.shape[2]\n\n        x = rearrange(x, \"b c f h w -> (b f) c h w\")\n        x = super().forward(x)\n        x = rearrange(x, \"(b f) c h w -> b c f h w\", f=video_length)\n\n        return x\n\n\nclass InflatedGroupNorm(nn.GroupNorm):\n    def forward(self, x):\n        video_length = x.shape[2]\n\n        x = rearrange(x, \"b c f h w -> (b f) c h w\")\n        x = super().forward(x)\n        x = rearrange(x, \"(b f) c h w -> b c f h w\", f=video_length)\n\n        return x\n\n\nclass Upsample3D(nn.Module):\n    def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name=\"conv\"):\n        super().__init__()\n        self.channels = channels\n        self.out_channels = out_channels or channels\n        self.use_conv = use_conv\n        self.use_conv_transpose = use_conv_transpose\n        self.name = name\n\n        conv = None\n        if use_conv_transpose:\n            raise NotImplementedError\n        elif use_conv:\n            self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)\n\n    def forward(self, hidden_states, output_size=None):\n        assert hidden_states.shape[1] == self.channels\n\n        if self.use_conv_transpose:\n            raise NotImplementedError\n\n        # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16\n        dtype = hidden_states.dtype\n        if dtype == torch.bfloat16:\n            hidden_states = hidden_states.to(torch.float32)\n\n        # upsample_nearest_nhwc fails with large batch sizes. 
see https://github.com/huggingface/diffusers/issues/984\n        if hidden_states.shape[0] >= 64:\n            hidden_states = hidden_states.contiguous()\n\n        # if `output_size` is passed we force the interpolation output\n        # size and do not make use of `scale_factor=2`\n        if output_size is None:\n            hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode=\"nearest\")\n        else:\n            hidden_states = F.interpolate(hidden_states, size=output_size, mode=\"nearest\")\n\n        # If the input is bfloat16, we cast back to bfloat16\n        if dtype == torch.bfloat16:\n            hidden_states = hidden_states.to(dtype)\n\n        # if self.use_conv:\n        #     if self.name == \"conv\":\n        #         hidden_states = self.conv(hidden_states)\n        #     else:\n        #         hidden_states = self.Conv2d_0(hidden_states)\n        hidden_states = self.conv(hidden_states)\n\n        return hidden_states\n\n\nclass Downsample3D(nn.Module):\n    def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name=\"conv\"):\n        super().__init__()\n        self.channels = channels\n        self.out_channels = out_channels or channels\n        self.use_conv = use_conv\n        self.padding = padding\n        stride = 2\n        self.name = name\n\n        if use_conv:\n            self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)\n        else:\n            raise NotImplementedError\n\n    def forward(self, hidden_states):\n        assert hidden_states.shape[1] == self.channels\n        if self.use_conv and self.padding == 0:\n            raise NotImplementedError\n\n        assert hidden_states.shape[1] == self.channels\n        hidden_states = self.conv(hidden_states)\n\n        return hidden_states\n\n\nclass ResnetBlock3D(nn.Module):\n    def __init__(\n        self,\n        *,\n        in_channels,\n        out_channels=None,\n        
conv_shortcut=False,\n        dropout=0.0,\n        temb_channels=512,\n        groups=32,\n        groups_out=None,\n        pre_norm=True,\n        eps=1e-6,\n        non_linearity=\"swish\",\n        time_embedding_norm=\"default\",\n        output_scale_factor=1.0,\n        use_in_shortcut=None,\n        use_inflated_groupnorm=False,\n    ):\n        super().__init__()\n        self.pre_norm = pre_norm\n        self.pre_norm = True\n        self.in_channels = in_channels\n        out_channels = in_channels if out_channels is None else out_channels\n        self.out_channels = out_channels\n        self.use_conv_shortcut = conv_shortcut\n        self.time_embedding_norm = time_embedding_norm\n        self.output_scale_factor = output_scale_factor\n\n        if groups_out is None:\n            groups_out = groups\n\n        assert use_inflated_groupnorm != None\n        if use_inflated_groupnorm:\n            self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)\n        else:\n            self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)\n\n        self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)\n\n        if temb_channels is not None:\n            if self.time_embedding_norm == \"default\":\n                time_emb_proj_out_channels = out_channels\n            elif self.time_embedding_norm == \"scale_shift\":\n                time_emb_proj_out_channels = out_channels * 2\n            else:\n                raise ValueError(f\"unknown time_embedding_norm : {self.time_embedding_norm} \")\n\n            self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels)\n        else:\n            self.time_emb_proj = None\n\n        if use_inflated_groupnorm:\n            self.norm2 = InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)\n        else:\n            self.norm2 = 
torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)\n\n        self.dropout = torch.nn.Dropout(dropout)\n        self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)\n\n        if non_linearity == \"swish\":\n            self.nonlinearity = lambda x: F.silu(x)\n        elif non_linearity == \"mish\":\n            self.nonlinearity = Mish()\n        elif non_linearity == \"silu\":\n            self.nonlinearity = nn.SiLU()\n\n        self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut\n\n        self.conv_shortcut = None\n        if self.use_in_shortcut:\n            self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)\n\n    def forward(self, input_tensor, temb):\n        hidden_states = input_tensor\n\n        hidden_states = self.norm1(hidden_states)\n        hidden_states = self.nonlinearity(hidden_states)\n\n        hidden_states = self.conv1(hidden_states)\n\n        if temb is not None:\n            temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]\n\n        if temb is not None and self.time_embedding_norm == \"default\":\n            hidden_states = hidden_states + temb\n\n        hidden_states = self.norm2(hidden_states)\n\n        if temb is not None and self.time_embedding_norm == \"scale_shift\":\n            scale, shift = torch.chunk(temb, 2, dim=1)\n            hidden_states = hidden_states * (1 + scale) + shift\n\n        hidden_states = self.nonlinearity(hidden_states)\n\n        hidden_states = self.dropout(hidden_states)\n        hidden_states = self.conv2(hidden_states)\n\n        if self.conv_shortcut is not None:\n            input_tensor = self.conv_shortcut(input_tensor)\n\n        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor\n\n        return output_tensor\n\n\nclass Mish(torch.nn.Module):\n    def 
forward(self, hidden_states):\n        return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states))"
  },
  {
    "path": "animatediff/models/sparse_controlnet.py",
    "content": "# Copyright 2023 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# \n#  Changes were made to this source code by Yuwei Guo.\nfrom dataclasses import dataclass\nfrom typing import Any, Dict, List, Optional, Tuple, Union\n\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom diffusers.configuration_utils import ConfigMixin, register_to_config\nfrom diffusers.utils import BaseOutput, logging\nfrom diffusers.models.embeddings import TimestepEmbedding, Timesteps\nfrom diffusers.modeling_utils import ModelMixin\n\n\nfrom .unet_blocks import (\n    CrossAttnDownBlock3D,\n    DownBlock3D,\n    UNetMidBlock3DCrossAttn,\n    get_down_block,\n)\nfrom einops import repeat, rearrange\nfrom .resnet import InflatedConv3d\n\nfrom diffusers.models.unet_2d_condition import UNet2DConditionModel\n\nlogger = logging.get_logger(__name__)  # pylint: disable=invalid-name\n\n\n@dataclass\nclass SparseControlNetOutput(BaseOutput):\n    down_block_res_samples: Tuple[torch.Tensor]\n    mid_block_res_sample: torch.Tensor\n\n\nclass SparseControlNetConditioningEmbedding(nn.Module):\n    def __init__(\n        self,\n        conditioning_embedding_channels: int,\n        conditioning_channels: int = 3,\n        block_out_channels: Tuple[int] = (16, 32, 96, 256),\n    ):\n        super().__init__()\n\n        self.conv_in = InflatedConv3d(conditioning_channels, block_out_channels[0], kernel_size=3, 
padding=1)\n\n        self.blocks = nn.ModuleList([])\n\n        for i in range(len(block_out_channels) - 1):\n            channel_in = block_out_channels[i]\n            channel_out = block_out_channels[i + 1]\n            self.blocks.append(InflatedConv3d(channel_in, channel_in, kernel_size=3, padding=1))\n            self.blocks.append(InflatedConv3d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))\n\n        self.conv_out = zero_module(\n            InflatedConv3d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)\n        )\n\n    def forward(self, conditioning):\n        embedding = self.conv_in(conditioning)\n        embedding = F.silu(embedding)\n\n        for block in self.blocks:\n            embedding = block(embedding)\n            embedding = F.silu(embedding)\n\n        embedding = self.conv_out(embedding)\n\n        return embedding\n\n\nclass SparseControlNetModel(ModelMixin, ConfigMixin):\n    _supports_gradient_checkpointing = True\n\n    @register_to_config\n    def __init__(\n        self,\n        in_channels: int = 4,\n        conditioning_channels: int = 3,\n        flip_sin_to_cos: bool = True,\n        freq_shift: int = 0,\n        down_block_types: Tuple[str] = (\n            \"CrossAttnDownBlock2D\",\n            \"CrossAttnDownBlock2D\",\n            \"CrossAttnDownBlock2D\",\n            \"DownBlock2D\",\n        ),\n        only_cross_attention: Union[bool, Tuple[bool]] = False,\n        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),\n        layers_per_block: int = 2,\n        downsample_padding: int = 1,\n        mid_block_scale_factor: float = 1,\n        act_fn: str = \"silu\",\n        norm_num_groups: Optional[int] = 32,\n        norm_eps: float = 1e-5,\n        cross_attention_dim: int = 1280,\n        attention_head_dim: Union[int, Tuple[int]] = 8,\n        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,\n        use_linear_projection: bool = False,\n        
class_embed_type: Optional[str] = None,\n        num_class_embeds: Optional[int] = None,\n        upcast_attention: bool = False,\n        resnet_time_scale_shift: str = \"default\",\n        projection_class_embeddings_input_dim: Optional[int] = None,\n        controlnet_conditioning_channel_order: str = \"rgb\",\n        conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),\n        global_pool_conditions: bool = False,\n\n        use_motion_module         = True,\n        motion_module_resolutions = ( 1,2,4,8 ),\n        motion_module_mid_block   = False,\n        motion_module_type        = \"Vanilla\",\n        motion_module_kwargs      = {\n            \"num_attention_heads\": 8,\n            \"num_transformer_block\": 1,\n            \"attention_block_types\": [\"Temporal_Self\"],\n            \"temporal_position_encoding\": True,\n            \"temporal_position_encoding_max_len\": 32,\n            \"temporal_attention_dim_div\": 1,\n            \"causal_temporal_attention\": False,\n        },\n\n        concate_conditioning_mask: bool = True,\n        use_simplified_condition_embedding:  bool = False,\n\n        set_noisy_sample_input_to_zero: bool = False,\n    ):\n        super().__init__()\n\n        # If `num_attention_heads` is not defined (which is the case for most models)\n        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.\n        # The reason for this behavior is to correct for incorrectly named variables that were introduced\n        # when this library was created. 
The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131\n        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking\n        # which is why we correct for the naming here.\n        num_attention_heads = num_attention_heads or attention_head_dim\n\n        # Check inputs\n        if len(block_out_channels) != len(down_block_types):\n            raise ValueError(\n                f\"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}.\"\n            )\n\n        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):\n            raise ValueError(\n                f\"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}.\"\n            )\n\n        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):\n            raise ValueError(\n                f\"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. 
`down_block_types`: {down_block_types}.\"\n            )\n\n        # input\n        self.set_noisy_sample_input_to_zero  = set_noisy_sample_input_to_zero\n\n        conv_in_kernel = 3\n        conv_in_padding = (conv_in_kernel - 1) // 2\n        self.conv_in = InflatedConv3d(\n            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding\n        )\n\n        if concate_conditioning_mask:\n            conditioning_channels = conditioning_channels + 1\n        self.concate_conditioning_mask = concate_conditioning_mask\n\n        # control net conditioning embedding\n        if use_simplified_condition_embedding:\n            self.controlnet_cond_embedding = zero_module(\n                InflatedConv3d(conditioning_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding)\n            )\n        else:\n            self.controlnet_cond_embedding = SparseControlNetConditioningEmbedding(\n                conditioning_embedding_channels=block_out_channels[0],\n                block_out_channels=conditioning_embedding_out_channels,\n                conditioning_channels=conditioning_channels,\n            )\n        self.use_simplified_condition_embedding = use_simplified_condition_embedding\n\n        # time\n        time_embed_dim = block_out_channels[0] * 4\n\n        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)\n        timestep_input_dim = block_out_channels[0]\n\n        self.time_embedding = TimestepEmbedding(\n            timestep_input_dim,\n            time_embed_dim,\n            act_fn=act_fn,\n        )\n\n        # class embedding\n        if class_embed_type is None and num_class_embeds is not None:\n            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)\n        elif class_embed_type == \"timestep\":\n            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)\n        elif class_embed_type == 
\"identity\":\n            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)\n        elif class_embed_type == \"projection\":\n            if projection_class_embeddings_input_dim is None:\n                raise ValueError(\n                    \"`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set\"\n                )\n            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except\n            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings\n            # 2. it projects from an arbitrary input dimension.\n            #\n            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.\n            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.\n            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.\n            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)\n        else:\n            self.class_embedding = None\n\n\n        self.down_blocks = nn.ModuleList([])\n        self.controlnet_down_blocks = nn.ModuleList([])\n\n        if isinstance(only_cross_attention, bool):\n            only_cross_attention = [only_cross_attention] * len(down_block_types)\n\n        if isinstance(attention_head_dim, int):\n            attention_head_dim = (attention_head_dim,) * len(down_block_types)\n\n        if isinstance(num_attention_heads, int):\n            num_attention_heads = (num_attention_heads,) * len(down_block_types)\n\n        # down\n        output_channel = block_out_channels[0]\n\n        controlnet_block = InflatedConv3d(output_channel, output_channel, kernel_size=1)\n        controlnet_block = zero_module(controlnet_block)\n        self.controlnet_down_blocks.append(controlnet_block)\n\n        for i, down_block_type in enumerate(down_block_types):\n            res = 2 ** 
i\n            input_channel = output_channel\n            output_channel = block_out_channels[i]\n            is_final_block = i == len(block_out_channels) - 1\n\n            down_block = get_down_block(\n                down_block_type,\n                num_layers=layers_per_block,\n                in_channels=input_channel,\n                out_channels=output_channel,\n                temb_channels=time_embed_dim,\n                add_downsample=not is_final_block,\n                resnet_eps=norm_eps,\n                resnet_act_fn=act_fn,\n                resnet_groups=norm_num_groups,\n                cross_attention_dim=cross_attention_dim,\n                attn_num_head_channels=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,\n                downsample_padding=downsample_padding,\n                use_linear_projection=use_linear_projection,\n                only_cross_attention=only_cross_attention[i],\n                upcast_attention=upcast_attention,\n                resnet_time_scale_shift=resnet_time_scale_shift,\n\n                use_inflated_groupnorm=True,\n\n                use_motion_module=use_motion_module and (res in motion_module_resolutions),\n                motion_module_type=motion_module_type,\n                motion_module_kwargs=motion_module_kwargs,\n            )\n            self.down_blocks.append(down_block)\n\n            for _ in range(layers_per_block):\n                controlnet_block = InflatedConv3d(output_channel, output_channel, kernel_size=1)\n                controlnet_block = zero_module(controlnet_block)\n                self.controlnet_down_blocks.append(controlnet_block)\n\n            if not is_final_block:\n                controlnet_block = InflatedConv3d(output_channel, output_channel, kernel_size=1)\n                controlnet_block = zero_module(controlnet_block)\n                self.controlnet_down_blocks.append(controlnet_block)\n\n        # mid\n        mid_block_channel = 
block_out_channels[-1]\n\n        controlnet_block = InflatedConv3d(mid_block_channel, mid_block_channel, kernel_size=1)\n        controlnet_block = zero_module(controlnet_block)\n        self.controlnet_mid_block = controlnet_block\n\n        self.mid_block = UNetMidBlock3DCrossAttn(\n            in_channels=mid_block_channel,\n            temb_channels=time_embed_dim,\n            resnet_eps=norm_eps,\n            resnet_act_fn=act_fn,\n            output_scale_factor=mid_block_scale_factor,\n            resnet_time_scale_shift=resnet_time_scale_shift,\n            cross_attention_dim=cross_attention_dim,\n            attn_num_head_channels=num_attention_heads[-1],\n            resnet_groups=norm_num_groups,\n            use_linear_projection=use_linear_projection,\n            upcast_attention=upcast_attention,\n\n            use_inflated_groupnorm=True,\n            use_motion_module=use_motion_module and motion_module_mid_block,\n            motion_module_type=motion_module_type,\n            motion_module_kwargs=motion_module_kwargs,\n        )\n\n    @classmethod\n    def from_unet(\n        cls,\n        unet: UNet2DConditionModel,\n        controlnet_conditioning_channel_order: str = \"rgb\",\n        conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),\n        load_weights_from_unet: bool = True,\n\n        controlnet_additional_kwargs: dict = {},\n    ):\n        controlnet = cls(\n            in_channels=unet.config.in_channels,\n            flip_sin_to_cos=unet.config.flip_sin_to_cos,\n            freq_shift=unet.config.freq_shift,\n            down_block_types=unet.config.down_block_types,\n            only_cross_attention=unet.config.only_cross_attention,\n            block_out_channels=unet.config.block_out_channels,\n            layers_per_block=unet.config.layers_per_block,\n            downsample_padding=unet.config.downsample_padding,\n            mid_block_scale_factor=unet.config.mid_block_scale_factor,\n            
act_fn=unet.config.act_fn,\n            norm_num_groups=unet.config.norm_num_groups,\n            norm_eps=unet.config.norm_eps,\n            cross_attention_dim=unet.config.cross_attention_dim,\n            attention_head_dim=unet.config.attention_head_dim,\n            num_attention_heads=unet.config.num_attention_heads,\n            use_linear_projection=unet.config.use_linear_projection,\n            class_embed_type=unet.config.class_embed_type,\n            num_class_embeds=unet.config.num_class_embeds,\n            upcast_attention=unet.config.upcast_attention,\n            resnet_time_scale_shift=unet.config.resnet_time_scale_shift,\n            projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,\n            controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,\n            conditioning_embedding_out_channels=conditioning_embedding_out_channels,\n\n            **controlnet_additional_kwargs,\n        )\n\n        if load_weights_from_unet:\n            m, u = controlnet.conv_in.load_state_dict(cls.image_layer_filter(unet.conv_in.state_dict()), strict=False)\n            assert len(u) == 0\n            m, u = controlnet.time_proj.load_state_dict(cls.image_layer_filter(unet.time_proj.state_dict()), strict=False)\n            assert len(u) == 0\n            m, u = controlnet.time_embedding.load_state_dict(cls.image_layer_filter(unet.time_embedding.state_dict()), strict=False)\n            assert len(u) == 0\n\n            if controlnet.class_embedding:\n                m, u = controlnet.class_embedding.load_state_dict(cls.image_layer_filter(unet.class_embedding.state_dict()), strict=False)\n                assert len(u) == 0\n            m, u = controlnet.down_blocks.load_state_dict(cls.image_layer_filter(unet.down_blocks.state_dict()), strict=False)\n            assert len(u) == 0\n            m, u = controlnet.mid_block.load_state_dict(cls.image_layer_filter(unet.mid_block.state_dict()), 
strict=False)\n            assert len(u) == 0\n\n        return controlnet\n\n    @staticmethod\n    def image_layer_filter(state_dict):\n        new_state_dict = {}\n        for name, param in state_dict.items():\n            if \"motion_modules.\" in name or \"lora\" in name: continue\n            new_state_dict[name] = param\n        return new_state_dict\n\n    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice\n    def set_attention_slice(self, slice_size):\n        r\"\"\"\n        Enable sliced attention computation.\n\n        When this option is enabled, the attention module splits the input tensor in slices to compute attention in\n        several steps. This is useful for saving some memory in exchange for a small decrease in speed.\n\n        Args:\n            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `\"auto\"`):\n                When `\"auto\"`, input to the attention heads is halved, so attention is computed in two steps. If\n                `\"max\"`, maximum amount of memory is saved by running only one slice at a time. If a number is\n                provided, uses as many slices as `attention_head_dim // slice_size`. 
In this case, `attention_head_dim`\n                must be a multiple of `slice_size`.\n        \"\"\"\n        sliceable_head_dims = []\n\n        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):\n            if hasattr(module, \"set_attention_slice\"):\n                sliceable_head_dims.append(module.sliceable_head_dim)\n\n            for child in module.children():\n                fn_recursive_retrieve_sliceable_dims(child)\n\n        # retrieve number of attention layers\n        for module in self.children():\n            fn_recursive_retrieve_sliceable_dims(module)\n\n        num_sliceable_layers = len(sliceable_head_dims)\n\n        if slice_size == \"auto\":\n            # half the attention head size is usually a good trade-off between\n            # speed and memory\n            slice_size = [dim // 2 for dim in sliceable_head_dims]\n        elif slice_size == \"max\":\n            # make smallest slice possible\n            slice_size = num_sliceable_layers * [1]\n\n        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size\n\n        if len(slice_size) != len(sliceable_head_dims):\n            raise ValueError(\n                f\"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different\"\n                f\" attention layers. 
Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}.\"\n            )\n\n        for i in range(len(slice_size)):\n            size = slice_size[i]\n            dim = sliceable_head_dims[i]\n            if size is not None and size > dim:\n                raise ValueError(f\"size {size} has to be smaller or equal to {dim}.\")\n\n        # Recursively walk through all the children.\n        # Any children which exposes the set_attention_slice method\n        # gets the message\n        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):\n            if hasattr(module, \"set_attention_slice\"):\n                module.set_attention_slice(slice_size.pop())\n\n            for child in module.children():\n                fn_recursive_set_attention_slice(child, slice_size)\n\n        reversed_slice_size = list(reversed(slice_size))\n        for module in self.children():\n            fn_recursive_set_attention_slice(module, reversed_slice_size)\n\n    def _set_gradient_checkpointing(self, module, value=False):\n        if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):\n            module.gradient_checkpointing = value\n\n    def forward(\n        self,\n        sample: torch.FloatTensor,\n        timestep: Union[torch.Tensor, float, int],\n        encoder_hidden_states: torch.Tensor,\n\n        controlnet_cond: torch.FloatTensor,\n        conditioning_mask: Optional[torch.FloatTensor] = None,\n\n        conditioning_scale: float = 1.0,\n        class_labels: Optional[torch.Tensor] = None,\n        attention_mask: Optional[torch.Tensor] = None,\n        cross_attention_kwargs: Optional[Dict[str, Any]] = None,\n        guess_mode: bool = False,\n        return_dict: bool = True,\n    ) -> Union[SparseControlNetOutput, Tuple]:\n\n        # set input noise to zero\n        if self.set_noisy_sample_input_to_zero:\n            sample = torch.zeros_like(sample).to(sample.device)\n\n        # prepare attention_mask\n  
      if attention_mask is not None:\n            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0\n            attention_mask = attention_mask.unsqueeze(1)\n\n        # 1. time\n        timesteps = timestep\n        if not torch.is_tensor(timesteps):\n            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can\n            # This would be a good case for the `match` statement (Python 3.10+)\n            is_mps = sample.device.type == \"mps\"\n            if isinstance(timestep, float):\n                dtype = torch.float32 if is_mps else torch.float64\n            else:\n                dtype = torch.int32 if is_mps else torch.int64\n            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)\n        elif len(timesteps.shape) == 0:\n            timesteps = timesteps[None].to(sample.device)\n\n        timesteps             = timesteps.repeat(sample.shape[0] // timesteps.shape[0])\n        encoder_hidden_states = encoder_hidden_states.repeat(sample.shape[0] // encoder_hidden_states.shape[0], 1, 1)\n\n        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML\n        timesteps = timesteps.expand(sample.shape[0])\n\n        t_emb = self.time_proj(timesteps)\n\n        # timesteps does not contain any weights and will always return f32 tensors\n        # but time_embedding might actually be running in fp16. 
so we need to cast here.\n        # there might be better ways to encapsulate this.\n        t_emb = t_emb.to(dtype=self.dtype)\n        emb = self.time_embedding(t_emb)\n\n        if self.class_embedding is not None:\n            if class_labels is None:\n                raise ValueError(\"class_labels should be provided when num_class_embeds > 0\")\n\n            if self.config.class_embed_type == \"timestep\":\n                class_labels = self.time_proj(class_labels)\n\n            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)\n            emb = emb + class_emb\n\n        # 2. pre-process\n        sample = self.conv_in(sample)\n        \n        if self.concate_conditioning_mask:\n            controlnet_cond = torch.cat([controlnet_cond, conditioning_mask], dim=1)\n        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)\n        \n        sample = sample + controlnet_cond\n\n        # 3. down\n        down_block_res_samples = (sample,)\n        for downsample_block in self.down_blocks:\n            if hasattr(downsample_block, \"has_cross_attention\") and downsample_block.has_cross_attention:\n                sample, res_samples = downsample_block(\n                    hidden_states=sample,\n                    temb=emb,\n                    encoder_hidden_states=encoder_hidden_states,\n                    attention_mask=attention_mask,\n                    # cross_attention_kwargs=cross_attention_kwargs,\n                )\n            else: sample, res_samples = downsample_block(hidden_states=sample, temb=emb)\n\n            down_block_res_samples += res_samples\n\n        # 4. mid\n        if self.mid_block is not None:\n            sample = self.mid_block(\n                sample,\n                emb,\n                encoder_hidden_states=encoder_hidden_states,\n                attention_mask=attention_mask,\n                # cross_attention_kwargs=cross_attention_kwargs,\n            )\n\n        # 5. 
controlnet blocks\n        controlnet_down_block_res_samples = ()\n\n        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):\n            down_block_res_sample = controlnet_block(down_block_res_sample)\n            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)\n\n        down_block_res_samples = controlnet_down_block_res_samples\n\n        mid_block_res_sample = self.controlnet_mid_block(sample)\n\n        # 6. scaling\n        if guess_mode and not self.config.global_pool_conditions:\n            scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device)  # 0.1 to 1.0\n\n            scales = scales * conditioning_scale\n            down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]\n            mid_block_res_sample = mid_block_res_sample * scales[-1]  # last one\n        else:\n            down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]\n            mid_block_res_sample = mid_block_res_sample * conditioning_scale\n\n        if self.config.global_pool_conditions:\n            down_block_res_samples = [\n                torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples\n            ]\n            mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)\n\n        if not return_dict:\n            return (down_block_res_samples, mid_block_res_sample)\n\n        return SparseControlNetOutput(\n            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample\n        )\n\n\ndef zero_module(module):\n    for p in module.parameters():\n        nn.init.zeros_(p)\n    return module\n"
  },
  {
    "path": "animatediff/models/unet.py",
    "content": "# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py\n\nfrom dataclasses import dataclass\nfrom typing import List, Optional, Tuple, Union\n\nimport os\nimport json\nimport pdb\n\nimport torch\nimport torch.nn as nn\nimport torch.utils.checkpoint\n\nfrom diffusers.configuration_utils import ConfigMixin, register_to_config\nfrom diffusers.modeling_utils import ModelMixin\nfrom diffusers.utils import BaseOutput, logging\nfrom diffusers.models.embeddings import TimestepEmbedding, Timesteps\nfrom .unet_blocks import (\n    CrossAttnDownBlock3D,\n    CrossAttnUpBlock3D,\n    DownBlock3D,\n    UNetMidBlock3DCrossAttn,\n    UpBlock3D,\n    get_down_block,\n    get_up_block,\n)\nfrom .resnet import InflatedConv3d, InflatedGroupNorm\n\n\nlogger = logging.get_logger(__name__)  # pylint: disable=invalid-name\n\n\n@dataclass\nclass UNet3DConditionOutput(BaseOutput):\n    sample: torch.FloatTensor\n\n\nclass UNet3DConditionModel(ModelMixin, ConfigMixin):\n    _supports_gradient_checkpointing = True\n\n    @register_to_config\n    def __init__(\n        self,\n        sample_size: Optional[int] = None,\n        in_channels: int = 4,\n        out_channels: int = 4,\n        center_input_sample: bool = False,\n        flip_sin_to_cos: bool = True,\n        freq_shift: int = 0,      \n        down_block_types: Tuple[str] = (\n            \"CrossAttnDownBlock3D\",\n            \"CrossAttnDownBlock3D\",\n            \"CrossAttnDownBlock3D\",\n            \"DownBlock3D\",\n        ),\n        mid_block_type: str = \"UNetMidBlock3DCrossAttn\",\n        up_block_types: Tuple[str] = (\n            \"UpBlock3D\",\n            \"CrossAttnUpBlock3D\",\n            \"CrossAttnUpBlock3D\",\n            \"CrossAttnUpBlock3D\"\n        ),\n        only_cross_attention: Union[bool, Tuple[bool]] = False,\n        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),\n        layers_per_block: int = 2,\n        
downsample_padding: int = 1,\n        mid_block_scale_factor: float = 1,\n        act_fn: str = \"silu\",\n        norm_num_groups: int = 32,\n        norm_eps: float = 1e-5,\n        cross_attention_dim: int = 1280,\n        attention_head_dim: Union[int, Tuple[int]] = 8,\n        dual_cross_attention: bool = False,\n        use_linear_projection: bool = False,\n        class_embed_type: Optional[str] = None,\n        num_class_embeds: Optional[int] = None,\n        upcast_attention: bool = False,\n        resnet_time_scale_shift: str = \"default\",\n        \n        use_inflated_groupnorm=False,\n        \n        # Additional\n        use_motion_module              = False,\n        motion_module_resolutions      = ( 1,2,4,8 ),\n        motion_module_mid_block        = False,\n        motion_module_decoder_only     = False,\n        motion_module_type             = None,\n        motion_module_kwargs           = {},\n        unet_use_cross_frame_attention = False,\n        unet_use_temporal_attention    = False,\n    ):\n        super().__init__()\n        \n        self.sample_size = sample_size\n        time_embed_dim = block_out_channels[0] * 4\n\n        # input\n        self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))\n\n        # time\n        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)\n        timestep_input_dim = block_out_channels[0]\n\n        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)\n\n        # class embedding\n        if class_embed_type is None and num_class_embeds is not None:\n            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)\n        elif class_embed_type == \"timestep\":\n            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)\n        elif class_embed_type == \"identity\":\n            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)\n        
else:\n            self.class_embedding = None\n\n        self.down_blocks = nn.ModuleList([])\n        self.mid_block = None\n        self.up_blocks = nn.ModuleList([])\n\n        if isinstance(only_cross_attention, bool):\n            only_cross_attention = [only_cross_attention] * len(down_block_types)\n\n        if isinstance(attention_head_dim, int):\n            attention_head_dim = (attention_head_dim,) * len(down_block_types)\n\n        # down\n        output_channel = block_out_channels[0]\n        for i, down_block_type in enumerate(down_block_types):\n            res = 2 ** i\n            input_channel = output_channel\n            output_channel = block_out_channels[i]\n            is_final_block = i == len(block_out_channels) - 1\n\n            down_block = get_down_block(\n                down_block_type,\n                num_layers=layers_per_block,\n                in_channels=input_channel,\n                out_channels=output_channel,\n                temb_channels=time_embed_dim,\n                add_downsample=not is_final_block,\n                resnet_eps=norm_eps,\n                resnet_act_fn=act_fn,\n                resnet_groups=norm_num_groups,\n                cross_attention_dim=cross_attention_dim,\n                attn_num_head_channels=attention_head_dim[i],\n                downsample_padding=downsample_padding,\n                dual_cross_attention=dual_cross_attention,\n                use_linear_projection=use_linear_projection,\n                only_cross_attention=only_cross_attention[i],\n                upcast_attention=upcast_attention,\n                resnet_time_scale_shift=resnet_time_scale_shift,\n\n                unet_use_cross_frame_attention=unet_use_cross_frame_attention,\n                unet_use_temporal_attention=unet_use_temporal_attention,\n                use_inflated_groupnorm=use_inflated_groupnorm,\n                \n                use_motion_module=use_motion_module and (res in 
motion_module_resolutions) and (not motion_module_decoder_only),\n                motion_module_type=motion_module_type,\n                motion_module_kwargs=motion_module_kwargs,\n            )\n            self.down_blocks.append(down_block)\n\n        # mid\n        if mid_block_type == \"UNetMidBlock3DCrossAttn\":\n            self.mid_block = UNetMidBlock3DCrossAttn(\n                in_channels=block_out_channels[-1],\n                temb_channels=time_embed_dim,\n                resnet_eps=norm_eps,\n                resnet_act_fn=act_fn,\n                output_scale_factor=mid_block_scale_factor,\n                resnet_time_scale_shift=resnet_time_scale_shift,\n                cross_attention_dim=cross_attention_dim,\n                attn_num_head_channels=attention_head_dim[-1],\n                resnet_groups=norm_num_groups,\n                dual_cross_attention=dual_cross_attention,\n                use_linear_projection=use_linear_projection,\n                upcast_attention=upcast_attention,\n\n                unet_use_cross_frame_attention=unet_use_cross_frame_attention,\n                unet_use_temporal_attention=unet_use_temporal_attention,\n                use_inflated_groupnorm=use_inflated_groupnorm,\n                \n                use_motion_module=use_motion_module and motion_module_mid_block,\n                motion_module_type=motion_module_type,\n                motion_module_kwargs=motion_module_kwargs,\n            )\n        else:\n            raise ValueError(f\"unknown mid_block_type : {mid_block_type}\")\n        \n        # count how many layers upsample the videos\n        self.num_upsamplers = 0\n\n        # up\n        reversed_block_out_channels = list(reversed(block_out_channels))\n        reversed_attention_head_dim = list(reversed(attention_head_dim))\n        only_cross_attention = list(reversed(only_cross_attention))\n        output_channel = reversed_block_out_channels[0]\n        for i, up_block_type in 
enumerate(up_block_types):\n            res = 2 ** (3 - i)\n            is_final_block = i == len(block_out_channels) - 1\n\n            prev_output_channel = output_channel\n            output_channel = reversed_block_out_channels[i]\n            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]\n\n            # add upsample block for all BUT final layer\n            if not is_final_block:\n                add_upsample = True\n                self.num_upsamplers += 1\n            else:\n                add_upsample = False\n\n            up_block = get_up_block(\n                up_block_type,\n                num_layers=layers_per_block + 1,\n                in_channels=input_channel,\n                out_channels=output_channel,\n                prev_output_channel=prev_output_channel,\n                temb_channels=time_embed_dim,\n                add_upsample=add_upsample,\n                resnet_eps=norm_eps,\n                resnet_act_fn=act_fn,\n                resnet_groups=norm_num_groups,\n                cross_attention_dim=cross_attention_dim,\n                attn_num_head_channels=reversed_attention_head_dim[i],\n                dual_cross_attention=dual_cross_attention,\n                use_linear_projection=use_linear_projection,\n                only_cross_attention=only_cross_attention[i],\n                upcast_attention=upcast_attention,\n                resnet_time_scale_shift=resnet_time_scale_shift,\n\n                unet_use_cross_frame_attention=unet_use_cross_frame_attention,\n                unet_use_temporal_attention=unet_use_temporal_attention,\n                use_inflated_groupnorm=use_inflated_groupnorm,\n\n                use_motion_module=use_motion_module and (res in motion_module_resolutions),\n                motion_module_type=motion_module_type,\n                motion_module_kwargs=motion_module_kwargs,\n            )\n            self.up_blocks.append(up_block)\n            
prev_output_channel = output_channel\n\n        # out\n        if use_inflated_groupnorm:\n            self.conv_norm_out = InflatedGroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)\n        else:\n            self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)\n        self.conv_act = nn.SiLU()\n        self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1)\n\n    def set_attention_slice(self, slice_size):\n        r\"\"\"\n        Enable sliced attention computation.\n\n        When this option is enabled, the attention module will split the input tensor in slices, to compute attention\n        in several steps. This is useful to save some memory in exchange for a small speed decrease.\n\n        Args:\n            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `\"auto\"`):\n                When `\"auto\"`, halves the input to the attention heads, so attention will be computed in two steps. If\n                `\"max\"`, maxium amount of memory will be saved by running only one slice at a time. If a number is\n                provided, uses as many slices as `attention_head_dim // slice_size`. 
In this case, `attention_head_dim`\n                must be a multiple of `slice_size`.\n        \"\"\"\n        sliceable_head_dims = []\n\n        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):\n            if hasattr(module, \"set_attention_slice\"):\n                sliceable_head_dims.append(module.sliceable_head_dim)\n\n            for child in module.children():\n                fn_recursive_retrieve_slicable_dims(child)\n\n        # retrieve number of attention layers\n        for module in self.children():\n            fn_recursive_retrieve_slicable_dims(module)\n\n        num_slicable_layers = len(sliceable_head_dims)\n\n        if slice_size == \"auto\":\n            # half the attention head size is usually a good trade-off between\n            # speed and memory\n            slice_size = [dim // 2 for dim in sliceable_head_dims]\n        elif slice_size == \"max\":\n            # make smallest slice possible\n            slice_size = num_slicable_layers * [1]\n\n        slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size\n\n        if len(slice_size) != len(sliceable_head_dims):\n            raise ValueError(\n                f\"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different\"\n                f\" attention layers. 
Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}.\"\n            )\n\n        for i in range(len(slice_size)):\n            size = slice_size[i]\n            dim = sliceable_head_dims[i]\n            if size is not None and size > dim:\n                raise ValueError(f\"size {size} has to be smaller or equal to {dim}.\")\n\n        # Recursively walk through all the children.\n        # Any children which exposes the set_attention_slice method\n        # gets the message\n        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):\n            if hasattr(module, \"set_attention_slice\"):\n                module.set_attention_slice(slice_size.pop())\n\n            for child in module.children():\n                fn_recursive_set_attention_slice(child, slice_size)\n\n        reversed_slice_size = list(reversed(slice_size))\n        for module in self.children():\n            fn_recursive_set_attention_slice(module, reversed_slice_size)\n\n    def _set_gradient_checkpointing(self, module, value=False):\n        if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):\n            module.gradient_checkpointing = value\n\n    def forward(\n        self,\n        sample: torch.FloatTensor,\n        timestep: Union[torch.Tensor, float, int],\n        encoder_hidden_states: torch.Tensor,\n        class_labels: Optional[torch.Tensor] = None,\n        attention_mask: Optional[torch.Tensor] = None,\n\n        # support controlnet\n        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,\n        mid_block_additional_residual: Optional[torch.Tensor] = None,\n\n        return_dict: bool = True,\n    ) -> Union[UNet3DConditionOutput, Tuple]:\n        r\"\"\"\n        Args:\n            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor\n            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps\n            
encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states\n            return_dict (`bool`, *optional*, defaults to `True`):\n                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.\n\n        Returns:\n            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:\n            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When\n            returning a tuple, the first element is the sample tensor.\n        \"\"\"\n        # By default samples have to be AT least a multiple of the overall upsampling factor.\n        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).\n        # However, the upsampling interpolation output size can be forced to fit any upsampling size\n        # on the fly if necessary.\n        default_overall_up_factor = 2**self.num_upsamplers\n\n        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`\n        forward_upsample_size = False\n        upsample_size = None\n\n        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):\n            logger.info(\"Forward upsample size to force interpolation output size.\")\n            forward_upsample_size = True\n\n        # prepare attention_mask\n        if attention_mask is not None:\n            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0\n            attention_mask = attention_mask.unsqueeze(1)\n\n        # center input if necessary\n        if self.config.center_input_sample:\n            sample = 2 * sample - 1.0\n\n        # time\n        timesteps = timestep\n        if not torch.is_tensor(timesteps):\n            # This would be a good case for the `match` statement (Python 3.10+)\n            is_mps = sample.device.type == \"mps\"\n            if isinstance(timestep, float):\n                dtype = 
torch.float32 if is_mps else torch.float64\n            else:\n                dtype = torch.int32 if is_mps else torch.int64\n            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)\n        elif len(timesteps.shape) == 0:\n            timesteps = timesteps[None].to(sample.device)\n\n        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML\n        timesteps = timesteps.expand(sample.shape[0])\n\n        t_emb = self.time_proj(timesteps)\n\n        # timesteps does not contain any weights and will always return f32 tensors\n        # but time_embedding might actually be running in fp16. so we need to cast here.\n        # there might be better ways to encapsulate this.\n        t_emb = t_emb.to(dtype=self.dtype)\n        emb = self.time_embedding(t_emb)\n\n        if self.class_embedding is not None:\n            if class_labels is None:\n                raise ValueError(\"class_labels should be provided when num_class_embeds > 0\")\n\n            if self.config.class_embed_type == \"timestep\":\n                class_labels = self.time_proj(class_labels)\n\n            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)\n            emb = emb + class_emb\n\n        # pre-process\n        sample = self.conv_in(sample)\n\n        # down\n        down_block_res_samples = (sample,)\n        for downsample_block in self.down_blocks:\n            if hasattr(downsample_block, \"has_cross_attention\") and downsample_block.has_cross_attention:\n                sample, res_samples = downsample_block(\n                    hidden_states=sample,\n                    temb=emb,\n                    encoder_hidden_states=encoder_hidden_states,\n                    attention_mask=attention_mask,\n                )\n            else:\n                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states)\n\n            down_block_res_samples += 
res_samples\n\n        # support controlnet\n        down_block_res_samples = list(down_block_res_samples)\n        if down_block_additional_residuals is not None:\n            for i, down_block_additional_residual in enumerate(down_block_additional_residuals):\n                if down_block_additional_residual.dim() == 4: # boardcast\n                    down_block_additional_residual = down_block_additional_residual.unsqueeze(2)\n                down_block_res_samples[i] = down_block_res_samples[i] + down_block_additional_residual\n\n        # mid\n        sample = self.mid_block(\n            sample, emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask\n        )\n\n        # support controlnet\n        if mid_block_additional_residual is not None:\n            if mid_block_additional_residual.dim() == 4: # boardcast\n                mid_block_additional_residual = mid_block_additional_residual.unsqueeze(2)\n            sample = sample + mid_block_additional_residual\n\n        # up\n        for i, upsample_block in enumerate(self.up_blocks):\n            is_final_block = i == len(self.up_blocks) - 1\n\n            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]\n            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]\n\n            # if we have not reached the final block and need to forward the\n            # upsample size, we do it here\n            if not is_final_block and forward_upsample_size:\n                upsample_size = down_block_res_samples[-1].shape[2:]\n\n            if hasattr(upsample_block, \"has_cross_attention\") and upsample_block.has_cross_attention:\n                sample = upsample_block(\n                    hidden_states=sample,\n                    temb=emb,\n                    res_hidden_states_tuple=res_samples,\n                    encoder_hidden_states=encoder_hidden_states,\n                    upsample_size=upsample_size,\n                    
attention_mask=attention_mask,\n                )\n            else:\n                sample = upsample_block(\n                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size, encoder_hidden_states=encoder_hidden_states,\n                )\n\n        # post-process\n        sample = self.conv_norm_out(sample)\n        sample = self.conv_act(sample)\n        sample = self.conv_out(sample)\n\n        if not return_dict:\n            return (sample,)\n\n        return UNet3DConditionOutput(sample=sample)\n\n    @classmethod\n    def from_pretrained_2d(cls, pretrained_model_name_or_path, unet_additional_kwargs={}, **kwargs):\n        from diffusers import __version__\n        from diffusers.utils import DIFFUSERS_CACHE, SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, is_safetensors_available\n        from diffusers.modeling_utils import load_state_dict\n        print(f\"loaded 3D unet's pretrained weights from {pretrained_model_name_or_path} ...\")\n\n        cache_dir = kwargs.pop(\"cache_dir\", DIFFUSERS_CACHE)\n        force_download = kwargs.pop(\"force_download\", False)\n        resume_download = kwargs.pop(\"resume_download\", False)\n        proxies = kwargs.pop(\"proxies\", None)\n        local_files_only = kwargs.pop(\"local_files_only\", False)\n        use_auth_token = kwargs.pop(\"use_auth_token\", None)\n        revision = kwargs.pop(\"revision\", None)\n        subfolder = kwargs.pop(\"subfolder\", None)\n        device_map = kwargs.pop(\"device_map\", None)\n\n        user_agent = {\n            \"diffusers\": __version__,\n            \"file_type\": \"model\",\n            \"framework\": \"pytorch\",\n        }\n\n        model_file = None\n        if is_safetensors_available():\n            try:\n                model_file = cls._get_model_file(\n                    pretrained_model_name_or_path,\n                    weights_name=SAFETENSORS_WEIGHTS_NAME,\n                    cache_dir=cache_dir,\n         
           force_download=force_download,\n                    resume_download=resume_download,\n                    proxies=proxies,\n                    local_files_only=local_files_only,\n                    use_auth_token=use_auth_token,\n                    revision=revision,\n                    subfolder=subfolder,\n                    user_agent=user_agent,\n                )\n            except:\n                pass\n\n        if model_file is None:\n            model_file = cls._get_model_file(\n                pretrained_model_name_or_path,\n                weights_name=WEIGHTS_NAME,\n                cache_dir=cache_dir,\n                force_download=force_download,\n                resume_download=resume_download,\n                proxies=proxies,\n                local_files_only=local_files_only,\n                use_auth_token=use_auth_token,\n                revision=revision,\n                subfolder=subfolder,\n                user_agent=user_agent,\n            )\n\n        config, unused_kwargs = cls.load_config(\n            pretrained_model_name_or_path,\n            cache_dir=cache_dir,\n            return_unused_kwargs=True,\n            force_download=force_download,\n            resume_download=resume_download,\n            proxies=proxies,\n            local_files_only=local_files_only,\n            use_auth_token=use_auth_token,\n            revision=revision,\n            subfolder=subfolder,\n            device_map=device_map,\n            **kwargs,\n        )\n\n        config[\"_class_name\"] = cls.__name__\n        config[\"down_block_types\"] = [\n            \"CrossAttnDownBlock3D\",\n            \"CrossAttnDownBlock3D\",\n            \"CrossAttnDownBlock3D\",\n            \"DownBlock3D\"\n        ]\n        config[\"up_block_types\"] = [\n            \"UpBlock3D\",\n            \"CrossAttnUpBlock3D\",\n            \"CrossAttnUpBlock3D\",\n            \"CrossAttnUpBlock3D\"\n        ]\n\n        model = 
cls.from_config(config, **unused_kwargs, **unet_additional_kwargs)\n        state_dict = load_state_dict(model_file)\n\n        m, u = model.load_state_dict(state_dict, strict=False)\n        print(f\"### missing keys: {len(m)}; \\n### unexpected keys: {len(u)};\")\n        \n        params = [p.numel() if \"motion_modules.\" in n else 0 for n, p in model.named_parameters()]\n        print(f\"### Motion Module Parameters: {sum(params) / 1e6} M\")\n        \n        return model\n"
  },
  {
    "path": "animatediff/models/unet_blocks.py",
    "content": "# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py\n\nimport torch\nfrom torch import nn\n\nfrom .attention import Transformer3DModel\nfrom .resnet import Downsample3D, ResnetBlock3D, Upsample3D\nfrom .motion_module import get_motion_module\n\nimport pdb\n\ndef get_down_block(\n    down_block_type,\n    num_layers,\n    in_channels,\n    out_channels,\n    temb_channels,\n    add_downsample,\n    resnet_eps,\n    resnet_act_fn,\n    attn_num_head_channels,\n    resnet_groups=None,\n    cross_attention_dim=None,\n    downsample_padding=None,\n    dual_cross_attention=False,\n    use_linear_projection=False,\n    only_cross_attention=False,\n    upcast_attention=False,\n    resnet_time_scale_shift=\"default\",\n    \n    unet_use_cross_frame_attention=False,\n    unet_use_temporal_attention=False,\n    use_inflated_groupnorm=False,\n\n    use_motion_module=None,\n    \n    motion_module_type=None,\n    motion_module_kwargs=None,\n):\n    down_block_type = down_block_type[7:] if down_block_type.startswith(\"UNetRes\") else down_block_type\n    if down_block_type == \"DownBlock3D\":\n        return DownBlock3D(\n            num_layers=num_layers,\n            in_channels=in_channels,\n            out_channels=out_channels,\n            temb_channels=temb_channels,\n            add_downsample=add_downsample,\n            resnet_eps=resnet_eps,\n            resnet_act_fn=resnet_act_fn,\n            resnet_groups=resnet_groups,\n            downsample_padding=downsample_padding,\n            resnet_time_scale_shift=resnet_time_scale_shift,\n\n            use_inflated_groupnorm=use_inflated_groupnorm,\n\n            use_motion_module=use_motion_module,\n            motion_module_type=motion_module_type,\n            motion_module_kwargs=motion_module_kwargs,\n        )\n    elif down_block_type == \"CrossAttnDownBlock3D\":\n        if cross_attention_dim is None:\n            raise 
ValueError(\"cross_attention_dim must be specified for CrossAttnDownBlock3D\")\n        return CrossAttnDownBlock3D(\n            num_layers=num_layers,\n            in_channels=in_channels,\n            out_channels=out_channels,\n            temb_channels=temb_channels,\n            add_downsample=add_downsample,\n            resnet_eps=resnet_eps,\n            resnet_act_fn=resnet_act_fn,\n            resnet_groups=resnet_groups,\n            downsample_padding=downsample_padding,\n            cross_attention_dim=cross_attention_dim,\n            attn_num_head_channels=attn_num_head_channels,\n            dual_cross_attention=dual_cross_attention,\n            use_linear_projection=use_linear_projection,\n            only_cross_attention=only_cross_attention,\n            upcast_attention=upcast_attention,\n            resnet_time_scale_shift=resnet_time_scale_shift,\n\n            unet_use_cross_frame_attention=unet_use_cross_frame_attention,\n            unet_use_temporal_attention=unet_use_temporal_attention,\n            use_inflated_groupnorm=use_inflated_groupnorm,\n            \n            use_motion_module=use_motion_module,\n            motion_module_type=motion_module_type,\n            motion_module_kwargs=motion_module_kwargs,\n        )\n    raise ValueError(f\"{down_block_type} does not exist.\")\n\n\ndef get_up_block(\n    up_block_type,\n    num_layers,\n    in_channels,\n    out_channels,\n    prev_output_channel,\n    temb_channels,\n    add_upsample,\n    resnet_eps,\n    resnet_act_fn,\n    attn_num_head_channels,\n    resnet_groups=None,\n    cross_attention_dim=None,\n    dual_cross_attention=False,\n    use_linear_projection=False,\n    only_cross_attention=False,\n    upcast_attention=False,\n    resnet_time_scale_shift=\"default\",\n\n    unet_use_cross_frame_attention=False,\n    unet_use_temporal_attention=False,\n    use_inflated_groupnorm=False,\n    \n    use_motion_module=None,\n    motion_module_type=None,\n    
motion_module_kwargs=None,\n):\n    up_block_type = up_block_type[7:] if up_block_type.startswith(\"UNetRes\") else up_block_type\n    if up_block_type == \"UpBlock3D\":\n        return UpBlock3D(\n            num_layers=num_layers,\n            in_channels=in_channels,\n            out_channels=out_channels,\n            prev_output_channel=prev_output_channel,\n            temb_channels=temb_channels,\n            add_upsample=add_upsample,\n            resnet_eps=resnet_eps,\n            resnet_act_fn=resnet_act_fn,\n            resnet_groups=resnet_groups,\n            resnet_time_scale_shift=resnet_time_scale_shift,\n\n            use_inflated_groupnorm=use_inflated_groupnorm,\n\n            use_motion_module=use_motion_module,\n            motion_module_type=motion_module_type,\n            motion_module_kwargs=motion_module_kwargs,\n        )\n    elif up_block_type == \"CrossAttnUpBlock3D\":\n        if cross_attention_dim is None:\n            raise ValueError(\"cross_attention_dim must be specified for CrossAttnUpBlock3D\")\n        return CrossAttnUpBlock3D(\n            num_layers=num_layers,\n            in_channels=in_channels,\n            out_channels=out_channels,\n            prev_output_channel=prev_output_channel,\n            temb_channels=temb_channels,\n            add_upsample=add_upsample,\n            resnet_eps=resnet_eps,\n            resnet_act_fn=resnet_act_fn,\n            resnet_groups=resnet_groups,\n            cross_attention_dim=cross_attention_dim,\n            attn_num_head_channels=attn_num_head_channels,\n            dual_cross_attention=dual_cross_attention,\n            use_linear_projection=use_linear_projection,\n            only_cross_attention=only_cross_attention,\n            upcast_attention=upcast_attention,\n            resnet_time_scale_shift=resnet_time_scale_shift,\n\n            unet_use_cross_frame_attention=unet_use_cross_frame_attention,\n            unet_use_temporal_attention=unet_use_temporal_attention,\n 
           use_inflated_groupnorm=use_inflated_groupnorm,\n\n            use_motion_module=use_motion_module,\n            motion_module_type=motion_module_type,\n            motion_module_kwargs=motion_module_kwargs,\n        )\n    raise ValueError(f\"{up_block_type} does not exist.\")\n\n\nclass UNetMidBlock3DCrossAttn(nn.Module):\n    def __init__(\n        self,\n        in_channels: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        attn_num_head_channels=1,\n        output_scale_factor=1.0,\n        cross_attention_dim=1280,\n        dual_cross_attention=False,\n        use_linear_projection=False,\n        upcast_attention=False,\n\n        unet_use_cross_frame_attention=False,\n        unet_use_temporal_attention=False,\n        use_inflated_groupnorm=False,\n\n        use_motion_module=None,\n        \n        motion_module_type=None,\n        motion_module_kwargs=None,\n    ):\n        super().__init__()\n\n        self.has_cross_attention = True\n        self.attn_num_head_channels = attn_num_head_channels\n        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)\n\n        # there is always at least one resnet\n        resnets = [\n            ResnetBlock3D(\n                in_channels=in_channels,\n                out_channels=in_channels,\n                temb_channels=temb_channels,\n                eps=resnet_eps,\n                groups=resnet_groups,\n                dropout=dropout,\n                time_embedding_norm=resnet_time_scale_shift,\n                non_linearity=resnet_act_fn,\n                output_scale_factor=output_scale_factor,\n                pre_norm=resnet_pre_norm,\n\n                use_inflated_groupnorm=use_inflated_groupnorm,\n 
           )\n        ]\n        attentions = []\n        motion_modules = []\n\n        for _ in range(num_layers):\n            if dual_cross_attention:\n                raise NotImplementedError\n            attentions.append(\n                Transformer3DModel(\n                    attn_num_head_channels,\n                    in_channels // attn_num_head_channels,\n                    in_channels=in_channels,\n                    num_layers=1,\n                    cross_attention_dim=cross_attention_dim,\n                    norm_num_groups=resnet_groups,\n                    use_linear_projection=use_linear_projection,\n                    upcast_attention=upcast_attention,\n\n                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,\n                    unet_use_temporal_attention=unet_use_temporal_attention,\n                )\n            )\n            motion_modules.append(\n                get_motion_module(\n                    in_channels=in_channels,\n                    motion_module_type=motion_module_type, \n                    motion_module_kwargs=motion_module_kwargs,\n                ) if use_motion_module else None\n            )\n            resnets.append(\n                ResnetBlock3D(\n                    in_channels=in_channels,\n                    out_channels=in_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n                    time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    output_scale_factor=output_scale_factor,\n                    pre_norm=resnet_pre_norm,\n\n                    use_inflated_groupnorm=use_inflated_groupnorm,\n                )\n            )\n\n        self.attentions = nn.ModuleList(attentions)\n        self.resnets = nn.ModuleList(resnets)\n        self.motion_modules = 
nn.ModuleList(motion_modules)\n\n    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None):\n        hidden_states = self.resnets[0](hidden_states, temb)\n        for attn, resnet, motion_module in zip(self.attentions, self.resnets[1:], self.motion_modules):\n            hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample\n            hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None else hidden_states\n            hidden_states = resnet(hidden_states, temb)\n\n        return hidden_states\n\n\nclass CrossAttnDownBlock3D(nn.Module):\n    def __init__(\n        self,\n        in_channels: int,\n        out_channels: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        attn_num_head_channels=1,\n        cross_attention_dim=1280,\n        output_scale_factor=1.0,\n        downsample_padding=1,\n        add_downsample=True,\n        dual_cross_attention=False,\n        use_linear_projection=False,\n        only_cross_attention=False,\n        upcast_attention=False,\n\n        unet_use_cross_frame_attention=False,\n        unet_use_temporal_attention=False,\n        use_inflated_groupnorm=False,\n        \n        use_motion_module=None,\n\n        motion_module_type=None,\n        motion_module_kwargs=None,\n    ):\n        super().__init__()\n        resnets = []\n        attentions = []\n        motion_modules = []\n\n        self.has_cross_attention = True\n        self.attn_num_head_channels = attn_num_head_channels\n\n        for i in range(num_layers):\n            in_channels = in_channels if i == 0 else out_channels\n            resnets.append(\n                
ResnetBlock3D(\n                    in_channels=in_channels,\n                    out_channels=out_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n                    time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    output_scale_factor=output_scale_factor,\n                    pre_norm=resnet_pre_norm,\n\n                    use_inflated_groupnorm=use_inflated_groupnorm,\n                )\n            )\n            if dual_cross_attention:\n                raise NotImplementedError\n            attentions.append(\n                Transformer3DModel(\n                    attn_num_head_channels,\n                    out_channels // attn_num_head_channels,\n                    in_channels=out_channels,\n                    num_layers=1,\n                    cross_attention_dim=cross_attention_dim,\n                    norm_num_groups=resnet_groups,\n                    use_linear_projection=use_linear_projection,\n                    only_cross_attention=only_cross_attention,\n                    upcast_attention=upcast_attention,\n\n                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,\n                    unet_use_temporal_attention=unet_use_temporal_attention,\n                )\n            )\n            motion_modules.append(\n                get_motion_module(\n                    in_channels=out_channels,\n                    motion_module_type=motion_module_type, \n                    motion_module_kwargs=motion_module_kwargs,\n                ) if use_motion_module else None\n            )\n            \n        self.attentions = nn.ModuleList(attentions)\n        self.resnets = nn.ModuleList(resnets)\n        self.motion_modules = nn.ModuleList(motion_modules)\n\n        if add_downsample:\n            self.downsamplers = 
nn.ModuleList(\n                [\n                    Downsample3D(\n                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name=\"op\"\n                    )\n                ]\n            )\n        else:\n            self.downsamplers = None\n\n        self.gradient_checkpointing = False\n\n    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None):\n        output_states = ()\n\n        for resnet, attn, motion_module in zip(self.resnets, self.attentions, self.motion_modules):\n            if self.training and self.gradient_checkpointing:\n\n                def create_custom_forward(module, return_dict=None):\n                    def custom_forward(*inputs):\n                        if return_dict is not None:\n                            return module(*inputs, return_dict=return_dict)\n                        else:\n                            return module(*inputs)\n\n                    return custom_forward\n\n                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)\n                hidden_states = torch.utils.checkpoint.checkpoint(\n                    create_custom_forward(attn, return_dict=False),\n                    hidden_states,\n                    encoder_hidden_states,\n                )[0]\n                if motion_module is not None:\n                    hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, encoder_hidden_states)\n                \n            else:\n                hidden_states = resnet(hidden_states, temb)\n                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample\n                \n                # add motion module\n                hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None 
else hidden_states\n\n            output_states += (hidden_states,)\n\n        if self.downsamplers is not None:\n            for downsampler in self.downsamplers:\n                hidden_states = downsampler(hidden_states)\n\n            output_states += (hidden_states,)\n\n        return hidden_states, output_states\n\n\nclass DownBlock3D(nn.Module):\n    def __init__(\n        self,\n        in_channels: int,\n        out_channels: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        output_scale_factor=1.0,\n        add_downsample=True,\n        downsample_padding=1,\n\n        use_inflated_groupnorm=False,\n        \n        use_motion_module=None,\n        motion_module_type=None,\n        motion_module_kwargs=None,\n    ):\n        super().__init__()\n        resnets = []\n        motion_modules = []\n\n        for i in range(num_layers):\n            in_channels = in_channels if i == 0 else out_channels\n            resnets.append(\n                ResnetBlock3D(\n                    in_channels=in_channels,\n                    out_channels=out_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n                    time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    output_scale_factor=output_scale_factor,\n                    pre_norm=resnet_pre_norm,\n\n                    use_inflated_groupnorm=use_inflated_groupnorm,\n                )\n            )\n            motion_modules.append(\n                get_motion_module(\n                    in_channels=out_channels,\n                    
motion_module_type=motion_module_type, \n                    motion_module_kwargs=motion_module_kwargs,\n                ) if use_motion_module else None\n            )\n            \n        self.resnets = nn.ModuleList(resnets)\n        self.motion_modules = nn.ModuleList(motion_modules)\n\n        if add_downsample:\n            self.downsamplers = nn.ModuleList(\n                [\n                    Downsample3D(\n                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name=\"op\"\n                    )\n                ]\n            )\n        else:\n            self.downsamplers = None\n\n        self.gradient_checkpointing = False\n\n    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):\n        output_states = ()\n\n        for resnet, motion_module in zip(self.resnets, self.motion_modules):\n            if self.training and self.gradient_checkpointing:\n                def create_custom_forward(module):\n                    def custom_forward(*inputs):\n                        return module(*inputs)\n\n                    return custom_forward\n\n                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)\n                if motion_module is not None:\n                    hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, encoder_hidden_states)\n            else:\n                hidden_states = resnet(hidden_states, temb)\n\n                # add motion module\n                hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None else hidden_states\n\n            output_states += (hidden_states,)\n\n        if self.downsamplers is not None:\n            for downsampler in self.downsamplers:\n                hidden_states = downsampler(hidden_states)\n\n            output_states 
+= (hidden_states,)\n\n        return hidden_states, output_states\n\n\nclass CrossAttnUpBlock3D(nn.Module):\n    def __init__(\n        self,\n        in_channels: int,\n        out_channels: int,\n        prev_output_channel: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        attn_num_head_channels=1,\n        cross_attention_dim=1280,\n        output_scale_factor=1.0,\n        add_upsample=True,\n        dual_cross_attention=False,\n        use_linear_projection=False,\n        only_cross_attention=False,\n        upcast_attention=False,\n\n        unet_use_cross_frame_attention=False,\n        unet_use_temporal_attention=False,\n        use_inflated_groupnorm=False,\n        \n        use_motion_module=None,\n\n        motion_module_type=None,\n        motion_module_kwargs=None,\n    ):\n        super().__init__()\n        resnets = []\n        attentions = []\n        motion_modules = []\n\n        self.has_cross_attention = True\n        self.attn_num_head_channels = attn_num_head_channels\n\n        for i in range(num_layers):\n            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels\n            resnet_in_channels = prev_output_channel if i == 0 else out_channels\n\n            resnets.append(\n                ResnetBlock3D(\n                    in_channels=resnet_in_channels + res_skip_channels,\n                    out_channels=out_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n                    time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    
output_scale_factor=output_scale_factor,\n                    pre_norm=resnet_pre_norm,\n\n                    use_inflated_groupnorm=use_inflated_groupnorm,\n                )\n            )\n            if dual_cross_attention:\n                raise NotImplementedError\n            attentions.append(\n                Transformer3DModel(\n                    attn_num_head_channels,\n                    out_channels // attn_num_head_channels,\n                    in_channels=out_channels,\n                    num_layers=1,\n                    cross_attention_dim=cross_attention_dim,\n                    norm_num_groups=resnet_groups,\n                    use_linear_projection=use_linear_projection,\n                    only_cross_attention=only_cross_attention,\n                    upcast_attention=upcast_attention,\n\n                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,\n                    unet_use_temporal_attention=unet_use_temporal_attention,\n                )\n            )\n            motion_modules.append(\n                get_motion_module(\n                    in_channels=out_channels,\n                    motion_module_type=motion_module_type, \n                    motion_module_kwargs=motion_module_kwargs,\n                ) if use_motion_module else None\n            )\n            \n        self.attentions = nn.ModuleList(attentions)\n        self.resnets = nn.ModuleList(resnets)\n        self.motion_modules = nn.ModuleList(motion_modules)\n\n        if add_upsample:\n            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])\n        else:\n            self.upsamplers = None\n\n        self.gradient_checkpointing = False\n\n    def forward(\n        self,\n        hidden_states,\n        res_hidden_states_tuple,\n        temb=None,\n        encoder_hidden_states=None,\n        upsample_size=None,\n        attention_mask=None,\n    ):\n        for resnet, attn, 
motion_module in zip(self.resnets, self.attentions, self.motion_modules):\n            # pop res hidden states\n            res_hidden_states = res_hidden_states_tuple[-1]\n            res_hidden_states_tuple = res_hidden_states_tuple[:-1]\n            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)\n\n            if self.training and self.gradient_checkpointing:\n\n                def create_custom_forward(module, return_dict=None):\n                    def custom_forward(*inputs):\n                        if return_dict is not None:\n                            return module(*inputs, return_dict=return_dict)\n                        else:\n                            return module(*inputs)\n\n                    return custom_forward\n\n                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)\n                hidden_states = torch.utils.checkpoint.checkpoint(\n                    create_custom_forward(attn, return_dict=False),\n                    hidden_states,\n                    encoder_hidden_states,\n                )[0]\n                if motion_module is not None:\n                    hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, encoder_hidden_states)\n            \n            else:\n                hidden_states = resnet(hidden_states, temb)\n                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample\n                \n                # add motion module\n                hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None else hidden_states\n\n        if self.upsamplers is not None:\n            for upsampler in self.upsamplers:\n                hidden_states = upsampler(hidden_states, upsample_size)\n\n        return hidden_states\n\n\nclass UpBlock3D(nn.Module):\n    def 
__init__(\n        self,\n        in_channels: int,\n        prev_output_channel: int,\n        out_channels: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        output_scale_factor=1.0,\n        add_upsample=True,\n\n        use_inflated_groupnorm=False,\n\n        use_motion_module=None,\n        motion_module_type=None,\n        motion_module_kwargs=None,\n    ):\n        super().__init__()\n        resnets = []\n        motion_modules = []\n\n        for i in range(num_layers):\n            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels\n            resnet_in_channels = prev_output_channel if i == 0 else out_channels\n\n            resnets.append(\n                ResnetBlock3D(\n                    in_channels=resnet_in_channels + res_skip_channels,\n                    out_channels=out_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n                    time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    output_scale_factor=output_scale_factor,\n                    pre_norm=resnet_pre_norm,\n\n                    use_inflated_groupnorm=use_inflated_groupnorm,\n                )\n            )\n            motion_modules.append(\n                get_motion_module(\n                    in_channels=out_channels,\n                    motion_module_type=motion_module_type, \n                    motion_module_kwargs=motion_module_kwargs,\n                ) if use_motion_module else None\n            )\n\n        self.resnets = nn.ModuleList(resnets)\n        self.motion_modules = 
nn.ModuleList(motion_modules)\n\n        if add_upsample:\n            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])\n        else:\n            self.upsamplers = None\n\n        self.gradient_checkpointing = False\n\n    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, encoder_hidden_states=None,):\n        for resnet, motion_module in zip(self.resnets, self.motion_modules):\n            # pop res hidden states\n            res_hidden_states = res_hidden_states_tuple[-1]\n            res_hidden_states_tuple = res_hidden_states_tuple[:-1]\n            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)\n\n            if self.training and self.gradient_checkpointing:\n                def create_custom_forward(module):\n                    def custom_forward(*inputs):\n                        return module(*inputs)\n\n                    return custom_forward\n\n                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)\n                if motion_module is not None:\n                    hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, encoder_hidden_states)\n            else:\n                hidden_states = resnet(hidden_states, temb)\n                hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None else hidden_states\n\n        if self.upsamplers is not None:\n            for upsampler in self.upsamplers:\n                hidden_states = upsampler(hidden_states, upsample_size)\n\n        return hidden_states\n"
  },
  {
    "path": "animatediff/pipelines/pipeline_animation.py",
    "content": "# Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py\n\nimport inspect\nfrom typing import Callable, List, Optional, Union\nfrom dataclasses import dataclass\n\nimport numpy as np\nimport torch\nfrom tqdm import tqdm\n\nfrom diffusers.utils import is_accelerate_available\nfrom packaging import version\nfrom transformers import CLIPTextModel, CLIPTokenizer\n\nfrom diffusers.configuration_utils import FrozenDict\nfrom diffusers.models import AutoencoderKL\nfrom diffusers.pipeline_utils import DiffusionPipeline\nfrom diffusers.schedulers import (\n    DDIMScheduler,\n    DPMSolverMultistepScheduler,\n    EulerAncestralDiscreteScheduler,\n    EulerDiscreteScheduler,\n    LMSDiscreteScheduler,\n    PNDMScheduler,\n)\nfrom diffusers.utils import deprecate, logging, BaseOutput\n\nfrom einops import rearrange\n\nfrom ..models.unet import UNet3DConditionModel\nfrom ..models.sparse_controlnet import SparseControlNetModel\nimport pdb\n\nlogger = logging.get_logger(__name__)  # pylint: disable=invalid-name\n\n\n@dataclass\nclass AnimationPipelineOutput(BaseOutput):\n    videos: Union[torch.Tensor, np.ndarray]\n\n\nclass AnimationPipeline(DiffusionPipeline):\n    _optional_components = []\n\n    def __init__(\n        self,\n        vae: AutoencoderKL,\n        text_encoder: CLIPTextModel,\n        tokenizer: CLIPTokenizer,\n        unet: UNet3DConditionModel,\n        scheduler: Union[\n            DDIMScheduler,\n            PNDMScheduler,\n            LMSDiscreteScheduler,\n            EulerDiscreteScheduler,\n            EulerAncestralDiscreteScheduler,\n            DPMSolverMultistepScheduler,\n        ],\n        controlnet: Union[SparseControlNetModel, None] = None,\n    ):\n        super().__init__()\n\n        if hasattr(scheduler.config, \"steps_offset\") and scheduler.config.steps_offset != 1:\n            deprecation_message = (\n                f\"The configuration file of this scheduler: 
{scheduler} is outdated. `steps_offset`\"\n                f\" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure \"\n                \"to update the config accordingly as leaving `steps_offset` might led to incorrect results\"\n                \" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,\"\n                \" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`\"\n                \" file\"\n            )\n            deprecate(\"steps_offset!=1\", \"1.0.0\", deprecation_message, standard_warn=False)\n            new_config = dict(scheduler.config)\n            new_config[\"steps_offset\"] = 1\n            scheduler._internal_dict = FrozenDict(new_config)\n\n        if hasattr(scheduler.config, \"clip_sample\") and scheduler.config.clip_sample is True:\n            deprecation_message = (\n                f\"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`.\"\n                \" `clip_sample` should be set to False in the configuration file. Please make sure to update the\"\n                \" config accordingly as not setting `clip_sample` in the config might lead to incorrect results in\"\n                \" future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very\"\n                \" nice if you could open a Pull request for the `scheduler/scheduler_config.json` file\"\n            )\n            deprecate(\"clip_sample not set\", \"1.0.0\", deprecation_message, standard_warn=False)\n            new_config = dict(scheduler.config)\n            new_config[\"clip_sample\"] = False\n            scheduler._internal_dict = FrozenDict(new_config)\n\n        is_unet_version_less_0_9_0 = hasattr(unet.config, \"_diffusers_version\") and version.parse(\n            version.parse(unet.config._diffusers_version).base_version\n        ) < version.parse(\"0.9.0.dev0\")\n        is_unet_sample_size_less_64 = hasattr(unet.config, \"sample_size\") and unet.config.sample_size < 64\n        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:\n            deprecation_message = (\n                \"The configuration file of the unet has set the default `sample_size` to smaller than\"\n                \" 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the\"\n                \" following: \\n- CompVis/stable-diffusion-v1-4 \\n- CompVis/stable-diffusion-v1-3 \\n-\"\n                \" CompVis/stable-diffusion-v1-2 \\n- CompVis/stable-diffusion-v1-1 \\n- runwayml/stable-diffusion-v1-5\"\n                \" \\n- runwayml/stable-diffusion-inpainting \\n you should change 'sample_size' to 64 in the\"\n                \" configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`\"\n                \" in the config might lead to incorrect results in future versions. 
If you have downloaded this\"\n                \" checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for\"\n                \" the `unet/config.json` file\"\n            )\n            deprecate(\"sample_size<64\", \"1.0.0\", deprecation_message, standard_warn=False)\n            new_config = dict(unet.config)\n            new_config[\"sample_size\"] = 64\n            unet._internal_dict = FrozenDict(new_config)\n\n        self.register_modules(\n            vae=vae,\n            text_encoder=text_encoder,\n            tokenizer=tokenizer,\n            unet=unet,\n            scheduler=scheduler,\n            controlnet=controlnet,\n        )\n        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)\n\n    def enable_vae_slicing(self):\n        self.vae.enable_slicing()\n\n    def disable_vae_slicing(self):\n        self.vae.disable_slicing()\n\n    def enable_sequential_cpu_offload(self, gpu_id=0):\n        if is_accelerate_available():\n            from accelerate import cpu_offload\n        else:\n            raise ImportError(\"Please install accelerate via `pip install accelerate`\")\n\n        device = torch.device(f\"cuda:{gpu_id}\")\n\n        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:\n            if cpu_offloaded_model is not None:\n                cpu_offload(cpu_offloaded_model, device)\n\n\n    @property\n    def _execution_device(self):\n        if self.device != torch.device(\"meta\") or not hasattr(self.unet, \"_hf_hook\"):\n            return self.device\n        for module in self.unet.modules():\n            if (\n                hasattr(module, \"_hf_hook\")\n                and hasattr(module._hf_hook, \"execution_device\")\n                and module._hf_hook.execution_device is not None\n            ):\n                return torch.device(module._hf_hook.execution_device)\n        return self.device\n\n    def _encode_prompt(self, prompt, device, 
num_videos_per_prompt, do_classifier_free_guidance, negative_prompt):\n        batch_size = len(prompt) if isinstance(prompt, list) else 1\n\n        text_inputs = self.tokenizer(\n            prompt,\n            padding=\"max_length\",\n            max_length=self.tokenizer.model_max_length,\n            truncation=True,\n            return_tensors=\"pt\",\n        )\n        text_input_ids = text_inputs.input_ids\n        untruncated_ids = self.tokenizer(prompt, padding=\"longest\", return_tensors=\"pt\").input_ids\n\n        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):\n            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])\n            logger.warning(\n                \"The following part of your input was truncated because CLIP can only handle sequences up to\"\n                f\" {self.tokenizer.model_max_length} tokens: {removed_text}\"\n            )\n\n        if hasattr(self.text_encoder.config, \"use_attention_mask\") and self.text_encoder.config.use_attention_mask:\n            attention_mask = text_inputs.attention_mask.to(device)\n        else:\n            attention_mask = None\n\n        text_embeddings = self.text_encoder(\n            text_input_ids.to(device),\n            attention_mask=attention_mask,\n        )\n        text_embeddings = text_embeddings[0]\n\n        # duplicate text embeddings for each generation per prompt, using mps friendly method\n        bs_embed, seq_len, _ = text_embeddings.shape\n        text_embeddings = text_embeddings.repeat(1, num_videos_per_prompt, 1)\n        text_embeddings = text_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)\n\n        # get unconditional embeddings for classifier free guidance\n        if do_classifier_free_guidance:\n            uncond_tokens: List[str]\n            if negative_prompt is None:\n                uncond_tokens = [\"\"] * batch_size\n 
           elif type(prompt) is not type(negative_prompt):\n                raise TypeError(\n                    f\"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=\"\n                    f\" {type(prompt)}.\"\n                )\n            elif isinstance(negative_prompt, str):\n                uncond_tokens = [negative_prompt]\n            elif batch_size != len(negative_prompt):\n                raise ValueError(\n                    f\"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:\"\n                    f\" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches\"\n                    \" the batch size of `prompt`.\"\n                )\n            else:\n                uncond_tokens = negative_prompt\n\n            max_length = text_input_ids.shape[-1]\n            uncond_input = self.tokenizer(\n                uncond_tokens,\n                padding=\"max_length\",\n                max_length=max_length,\n                truncation=True,\n                return_tensors=\"pt\",\n            )\n\n            if hasattr(self.text_encoder.config, \"use_attention_mask\") and self.text_encoder.config.use_attention_mask:\n                attention_mask = uncond_input.attention_mask.to(device)\n            else:\n                attention_mask = None\n\n            uncond_embeddings = self.text_encoder(\n                uncond_input.input_ids.to(device),\n                attention_mask=attention_mask,\n            )\n            uncond_embeddings = uncond_embeddings[0]\n\n            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method\n            seq_len = uncond_embeddings.shape[1]\n            uncond_embeddings = uncond_embeddings.repeat(1, num_videos_per_prompt, 1)\n            uncond_embeddings = uncond_embeddings.view(batch_size * num_videos_per_prompt, seq_len, -1)\n\n            # For 
classifier free guidance, we need to do two forward passes.\n            # Here we concatenate the unconditional and text embeddings into a single batch\n            # to avoid doing two forward passes\n            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])\n\n        return text_embeddings\n\n    def decode_latents(self, latents):\n        video_length = latents.shape[2]\n        latents = 1 / 0.18215 * latents\n        latents = rearrange(latents, \"b c f h w -> (b f) c h w\")\n        # video = self.vae.decode(latents).sample\n        video = []\n        for frame_idx in tqdm(range(latents.shape[0])):\n            video.append(self.vae.decode(latents[frame_idx:frame_idx+1]).sample)\n        video = torch.cat(video)\n        video = rearrange(video, \"(b f) c h w -> b c f h w\", f=video_length)\n        video = (video / 2 + 0.5).clamp(0, 1)\n        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16\n        video = video.cpu().float().numpy()\n        return video\n\n    def prepare_extra_step_kwargs(self, generator, eta):\n        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature\n        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.\n        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502\n        # and should be between [0, 1]\n\n        accepts_eta = \"eta\" in set(inspect.signature(self.scheduler.step).parameters.keys())\n        extra_step_kwargs = {}\n        if accepts_eta:\n            extra_step_kwargs[\"eta\"] = eta\n\n        # check if the scheduler accepts generator\n        accepts_generator = \"generator\" in set(inspect.signature(self.scheduler.step).parameters.keys())\n        if accepts_generator:\n            extra_step_kwargs[\"generator\"] = generator\n        return extra_step_kwargs\n\n    def check_inputs(self, prompt, height, width, callback_steps):\n    
    if not isinstance(prompt, str) and not isinstance(prompt, list):\n            raise ValueError(f\"`prompt` has to be of type `str` or `list` but is {type(prompt)}\")\n\n        if height % 8 != 0 or width % 8 != 0:\n            raise ValueError(f\"`height` and `width` have to be divisible by 8 but are {height} and {width}.\")\n\n        if (callback_steps is None) or (\n            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)\n        ):\n            raise ValueError(\n                f\"`callback_steps` has to be a positive integer but is {callback_steps} of type\"\n                f\" {type(callback_steps)}.\"\n            )\n\n    def prepare_latents(self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None):\n        shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)\n        if isinstance(generator, list) and len(generator) != batch_size:\n            raise ValueError(\n                f\"You have passed a list of generators of length {len(generator)}, but requested an effective batch\"\n                f\" size of {batch_size}. 
Make sure the batch size matches the length of the generators.\"\n            )\n        if latents is None:\n            rand_device = \"cpu\" if device.type == \"mps\" else device\n\n            if isinstance(generator, list):\n                shape = shape\n                # shape = (1,) + shape[1:]\n                latents = [\n                    torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)\n                    for i in range(batch_size)\n                ]\n                latents = torch.cat(latents, dim=0).to(device)\n            else:\n                latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)\n        else:\n            if latents.shape != shape:\n                raise ValueError(f\"Unexpected latents shape, got {latents.shape}, expected {shape}\")\n            latents = latents.to(device)\n\n        # scale the initial noise by the standard deviation required by the scheduler\n        latents = latents * self.scheduler.init_noise_sigma\n        return latents\n\n    @torch.no_grad()\n    def __call__(\n        self,\n        prompt: Union[str, List[str]],\n        video_length: Optional[int],\n        height: Optional[int] = None,\n        width: Optional[int] = None,\n        num_inference_steps: int = 50,\n        guidance_scale: float = 7.5,\n        negative_prompt: Optional[Union[str, List[str]]] = None,\n        num_videos_per_prompt: Optional[int] = 1,\n        eta: float = 0.0,\n        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,\n        latents: Optional[torch.FloatTensor] = None,\n        output_type: Optional[str] = \"tensor\",\n        return_dict: bool = True,\n        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,\n        callback_steps: Optional[int] = 1,\n\n        # support controlnet\n        controlnet_images: torch.FloatTensor = None,\n        controlnet_image_index: list = [0],\n        
controlnet_conditioning_scale: Union[float, List[float]] = 1.0,\n\n        **kwargs,\n    ):\n        # Default height and width to unet\n        height = height or self.unet.config.sample_size * self.vae_scale_factor\n        width = width or self.unet.config.sample_size * self.vae_scale_factor\n\n        # Check inputs. Raise error if not correct\n        self.check_inputs(prompt, height, width, callback_steps)\n\n        # Define call parameters\n        # batch_size = 1 if isinstance(prompt, str) else len(prompt)\n        batch_size = 1\n        if latents is not None:\n            batch_size = latents.shape[0]\n        if isinstance(prompt, list):\n            batch_size = len(prompt)\n\n        device = self._execution_device\n        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)\n        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`\n        # corresponds to doing no classifier free guidance.\n        do_classifier_free_guidance = guidance_scale > 1.0\n\n        # Encode input prompt\n        prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size\n        if negative_prompt is not None:\n            negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt] * batch_size \n        text_embeddings = self._encode_prompt(\n            prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt\n        )\n\n        # Prepare timesteps\n        self.scheduler.set_timesteps(num_inference_steps, device=device)\n        timesteps = self.scheduler.timesteps\n\n        # Prepare latent variables\n        num_channels_latents = self.unet.in_channels\n        latents = self.prepare_latents(\n            batch_size * num_videos_per_prompt,\n            num_channels_latents,\n            video_length,\n            height,\n            width,\n            text_embeddings.dtype,\n            device,\n            generator,\n  
          latents,\n        )\n        latents_dtype = latents.dtype\n\n        # Prepare extra step kwargs.\n        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)\n\n        # Denoising loop\n        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order\n        with self.progress_bar(total=num_inference_steps) as progress_bar:\n            for i, t in enumerate(timesteps):\n                # expand the latents if we are doing classifier free guidance\n                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents\n                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)\n\n                down_block_additional_residuals = mid_block_additional_residual = None\n                if (getattr(self, \"controlnet\", None) != None) and (controlnet_images != None):\n                    assert controlnet_images.dim() == 5\n\n                    controlnet_noisy_latents = latent_model_input\n                    controlnet_prompt_embeds = text_embeddings\n\n                    controlnet_images = controlnet_images.to(latents.device)\n\n                    controlnet_cond_shape    = list(controlnet_images.shape)\n                    controlnet_cond_shape[2] = video_length\n                    controlnet_cond = torch.zeros(controlnet_cond_shape).to(latents.device)\n\n                    controlnet_conditioning_mask_shape    = list(controlnet_cond.shape)\n                    controlnet_conditioning_mask_shape[1] = 1\n                    controlnet_conditioning_mask          = torch.zeros(controlnet_conditioning_mask_shape).to(latents.device)\n\n                    assert controlnet_images.shape[2] >= len(controlnet_image_index)\n                    controlnet_cond[:,:,controlnet_image_index] = controlnet_images[:,:,:len(controlnet_image_index)]\n                    controlnet_conditioning_mask[:,:,controlnet_image_index] = 1\n\n                    
down_block_additional_residuals, mid_block_additional_residual = self.controlnet(\n                        controlnet_noisy_latents, t,\n                        encoder_hidden_states=controlnet_prompt_embeds,\n                        controlnet_cond=controlnet_cond,\n                        conditioning_mask=controlnet_conditioning_mask,\n                        conditioning_scale=controlnet_conditioning_scale,\n                        guess_mode=False, return_dict=False,\n                    )\n\n                # predict the noise residual\n                noise_pred = self.unet(\n                    latent_model_input, t, \n                    encoder_hidden_states=text_embeddings,\n                    down_block_additional_residuals = down_block_additional_residuals,\n                    mid_block_additional_residual   = mid_block_additional_residual,\n                ).sample.to(dtype=latents_dtype)\n\n                # perform guidance\n                if do_classifier_free_guidance:\n                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)\n                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)\n\n                # compute the previous noisy sample x_t -> x_t-1\n                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample\n\n                # call the callback, if provided\n                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):\n                    progress_bar.update()\n                    if callback is not None and i % callback_steps == 0:\n                        callback(i, t, latents)\n\n        # Post-processing\n        video = self.decode_latents(latents)\n\n        # Convert to tensor\n        if output_type == \"tensor\":\n            video = torch.from_numpy(video)\n\n        if not return_dict:\n            return video\n\n        return 
AnimationPipelineOutput(videos=video)\n"
  },
  {
    "path": "animatediff/utils/convert_from_ckpt.py",
    "content": "# coding=utf-8\n# Copyright 2023 The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" Conversion script for the Stable Diffusion checkpoints.\"\"\"\n\nimport re\nfrom io import BytesIO\nfrom typing import Optional\n\nimport requests\nimport torch\nfrom transformers import (\n    AutoFeatureExtractor,\n    BertTokenizerFast,\n    CLIPImageProcessor,\n    CLIPTextModel,\n    CLIPTextModelWithProjection,\n    CLIPTokenizer,\n    CLIPVisionConfig,\n    CLIPVisionModelWithProjection,\n)\n\nfrom diffusers.models import (\n    AutoencoderKL,\n    PriorTransformer,\n    UNet2DConditionModel,\n)\nfrom diffusers.schedulers import (\n    DDIMScheduler,\n    DDPMScheduler,\n    DPMSolverMultistepScheduler,\n    EulerAncestralDiscreteScheduler,\n    EulerDiscreteScheduler,\n    HeunDiscreteScheduler,\n    LMSDiscreteScheduler,\n    PNDMScheduler,\n    UnCLIPScheduler,\n)\nfrom diffusers.utils.import_utils import BACKENDS_MAPPING\n\n\ndef shave_segments(path, n_shave_prefix_segments=1):\n    \"\"\"\n    Removes segments. 
Positive values shave the first segments, negative shave the last segments.\n    \"\"\"\n    if n_shave_prefix_segments >= 0:\n        return \".\".join(path.split(\".\")[n_shave_prefix_segments:])\n    else:\n        return \".\".join(path.split(\".\")[:n_shave_prefix_segments])\n\n\ndef renew_resnet_paths(old_list, n_shave_prefix_segments=0):\n    \"\"\"\n    Updates paths inside resnets to the new naming scheme (local renaming)\n    \"\"\"\n    mapping = []\n    for old_item in old_list:\n        new_item = old_item.replace(\"in_layers.0\", \"norm1\")\n        new_item = new_item.replace(\"in_layers.2\", \"conv1\")\n\n        new_item = new_item.replace(\"out_layers.0\", \"norm2\")\n        new_item = new_item.replace(\"out_layers.3\", \"conv2\")\n\n        new_item = new_item.replace(\"emb_layers.1\", \"time_emb_proj\")\n        new_item = new_item.replace(\"skip_connection\", \"conv_shortcut\")\n\n        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)\n\n        mapping.append({\"old\": old_item, \"new\": new_item})\n\n    return mapping\n\n\ndef renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):\n    \"\"\"\n    Updates paths inside resnets to the new naming scheme (local renaming)\n    \"\"\"\n    mapping = []\n    for old_item in old_list:\n        new_item = old_item\n\n        new_item = new_item.replace(\"nin_shortcut\", \"conv_shortcut\")\n        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)\n\n        mapping.append({\"old\": old_item, \"new\": new_item})\n\n    return mapping\n\n\ndef renew_attention_paths(old_list, n_shave_prefix_segments=0):\n    \"\"\"\n    Updates paths inside attentions to the new naming scheme (local renaming)\n    \"\"\"\n    mapping = []\n    for old_item in old_list:\n        new_item = old_item\n\n        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')\n        #         new_item = new_item.replace('norm.bias', 
'group_norm.bias')\n\n        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')\n        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')\n\n        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)\n\n        mapping.append({\"old\": old_item, \"new\": new_item})\n\n    return mapping\n\n\ndef renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):\n    \"\"\"\n    Updates paths inside attentions to the new naming scheme (local renaming)\n    \"\"\"\n    mapping = []\n    for old_item in old_list:\n        new_item = old_item\n\n        new_item = new_item.replace(\"norm.weight\", \"group_norm.weight\")\n        new_item = new_item.replace(\"norm.bias\", \"group_norm.bias\")\n\n        new_item = new_item.replace(\"q.weight\", \"query.weight\")\n        new_item = new_item.replace(\"q.bias\", \"query.bias\")\n\n        new_item = new_item.replace(\"k.weight\", \"key.weight\")\n        new_item = new_item.replace(\"k.bias\", \"key.bias\")\n\n        new_item = new_item.replace(\"v.weight\", \"value.weight\")\n        new_item = new_item.replace(\"v.bias\", \"value.bias\")\n\n        new_item = new_item.replace(\"proj_out.weight\", \"proj_attn.weight\")\n        new_item = new_item.replace(\"proj_out.bias\", \"proj_attn.bias\")\n\n        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)\n\n        mapping.append({\"old\": old_item, \"new\": new_item})\n\n    return mapping\n\n\ndef assign_to_checkpoint(\n    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None\n):\n    \"\"\"\n    This does the final conversion step: take locally converted weights and apply a global renaming to them. 
It splits\n    attention layers, and takes into account additional replacements that may arise.\n\n    Assigns the weights to the new checkpoint.\n    \"\"\"\n    assert isinstance(paths, list), \"Paths should be a list of dicts containing 'old' and 'new' keys.\"\n\n    # Splits the attention layers into three variables.\n    if attention_paths_to_split is not None:\n        for path, path_map in attention_paths_to_split.items():\n            old_tensor = old_checkpoint[path]\n            channels = old_tensor.shape[0] // 3\n\n            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)\n\n            num_heads = old_tensor.shape[0] // config[\"num_head_channels\"] // 3\n\n            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])\n            query, key, value = old_tensor.split(channels // num_heads, dim=1)\n\n            checkpoint[path_map[\"query\"]] = query.reshape(target_shape)\n            checkpoint[path_map[\"key\"]] = key.reshape(target_shape)\n            checkpoint[path_map[\"value\"]] = value.reshape(target_shape)\n\n    for path in paths:\n        new_path = path[\"new\"]\n\n        # These have already been assigned\n        if attention_paths_to_split is not None and new_path in attention_paths_to_split:\n            continue\n\n        # Global renaming happens here\n        new_path = new_path.replace(\"middle_block.0\", \"mid_block.resnets.0\")\n        new_path = new_path.replace(\"middle_block.1\", \"mid_block.attentions.0\")\n        new_path = new_path.replace(\"middle_block.2\", \"mid_block.resnets.1\")\n\n        if additional_replacements is not None:\n            for replacement in additional_replacements:\n                new_path = new_path.replace(replacement[\"old\"], replacement[\"new\"])\n\n        # proj_attn.weight has to be converted from conv 1D to linear\n        if \"proj_attn.weight\" in new_path:\n            checkpoint[new_path] = 
old_checkpoint[path[\"old\"]][:, :, 0]\n        else:\n            checkpoint[new_path] = old_checkpoint[path[\"old\"]]\n\n\ndef conv_attn_to_linear(checkpoint):\n    keys = list(checkpoint.keys())\n    attn_keys = [\"query.weight\", \"key.weight\", \"value.weight\"]\n    for key in keys:\n        if \".\".join(key.split(\".\")[-2:]) in attn_keys:\n            if checkpoint[key].ndim > 2:\n                checkpoint[key] = checkpoint[key][:, :, 0, 0]\n        elif \"proj_attn.weight\" in key:\n            if checkpoint[key].ndim > 2:\n                checkpoint[key] = checkpoint[key][:, :, 0]\n\n\ndef create_unet_diffusers_config(original_config, image_size: int, controlnet=False):\n    \"\"\"\n    Creates a config for the diffusers based on the config of the LDM model.\n    \"\"\"\n    if controlnet:\n        unet_params = original_config.model.params.control_stage_config.params\n    else:\n        unet_params = original_config.model.params.unet_config.params\n\n    vae_params = original_config.model.params.first_stage_config.params.ddconfig\n\n    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]\n\n    down_block_types = []\n    resolution = 1\n    for i in range(len(block_out_channels)):\n        block_type = \"CrossAttnDownBlock2D\" if resolution in unet_params.attention_resolutions else \"DownBlock2D\"\n        down_block_types.append(block_type)\n        if i != len(block_out_channels) - 1:\n            resolution *= 2\n\n    up_block_types = []\n    for i in range(len(block_out_channels)):\n        block_type = \"CrossAttnUpBlock2D\" if resolution in unet_params.attention_resolutions else \"UpBlock2D\"\n        up_block_types.append(block_type)\n        resolution //= 2\n\n    vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)\n\n    head_dim = unet_params.num_heads if \"num_heads\" in unet_params else None\n    use_linear_projection = (\n        unet_params.use_linear_in_transformer if 
\"use_linear_in_transformer\" in unet_params else False\n    )\n    if use_linear_projection:\n        # stable diffusion 2-base-512 and 2-768\n        if head_dim is None:\n            head_dim = [5, 10, 20, 20]\n\n    class_embed_type = None\n    projection_class_embeddings_input_dim = None\n\n    if \"num_classes\" in unet_params:\n        if unet_params.num_classes == \"sequential\":\n            class_embed_type = \"projection\"\n            assert \"adm_in_channels\" in unet_params\n            projection_class_embeddings_input_dim = unet_params.adm_in_channels\n        else:\n            raise NotImplementedError(f\"Unknown conditional unet num_classes config: {unet_params.num_classes}\")\n\n    config = {\n        \"sample_size\": image_size // vae_scale_factor,\n        \"in_channels\": unet_params.in_channels,\n        \"down_block_types\": tuple(down_block_types),\n        \"block_out_channels\": tuple(block_out_channels),\n        \"layers_per_block\": unet_params.num_res_blocks,\n        \"cross_attention_dim\": unet_params.context_dim,\n        \"attention_head_dim\": head_dim,\n        \"use_linear_projection\": use_linear_projection,\n        \"class_embed_type\": class_embed_type,\n        \"projection_class_embeddings_input_dim\": projection_class_embeddings_input_dim,\n    }\n\n    if not controlnet:\n        config[\"out_channels\"] = unet_params.out_channels\n        config[\"up_block_types\"] = tuple(up_block_types)\n\n    return config\n\n\ndef create_vae_diffusers_config(original_config, image_size: int):\n    \"\"\"\n    Creates a config for the diffusers based on the config of the LDM model.\n    \"\"\"\n    vae_params = original_config.model.params.first_stage_config.params.ddconfig\n    _ = original_config.model.params.first_stage_config.params.embed_dim\n\n    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]\n    down_block_types = [\"DownEncoderBlock2D\"] * len(block_out_channels)\n    up_block_types = 
[\"UpDecoderBlock2D\"] * len(block_out_channels)\n\n    config = {\n        \"sample_size\": image_size,\n        \"in_channels\": vae_params.in_channels,\n        \"out_channels\": vae_params.out_ch,\n        \"down_block_types\": tuple(down_block_types),\n        \"up_block_types\": tuple(up_block_types),\n        \"block_out_channels\": tuple(block_out_channels),\n        \"latent_channels\": vae_params.z_channels,\n        \"layers_per_block\": vae_params.num_res_blocks,\n    }\n    return config\n\n\ndef create_diffusers_schedular(original_config):\n    schedular = DDIMScheduler(\n        num_train_timesteps=original_config.model.params.timesteps,\n        beta_start=original_config.model.params.linear_start,\n        beta_end=original_config.model.params.linear_end,\n        beta_schedule=\"scaled_linear\",\n    )\n    return schedular\n\n\ndef create_ldm_bert_config(original_config):\n    bert_params = original_config.model.parms.cond_stage_config.params\n    config = LDMBertConfig(\n        d_model=bert_params.n_embed,\n        encoder_layers=bert_params.n_layer,\n        encoder_ffn_dim=bert_params.n_embed * 4,\n    )\n    return config\n\n\ndef convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False):\n    \"\"\"\n    Takes a state dict and a config, and returns a converted checkpoint.\n    \"\"\"\n\n    # extract state_dict for UNet\n    unet_state_dict = {}\n    keys = list(checkpoint.keys())\n\n    if controlnet:\n        unet_key = \"control_model.\"\n    else:\n        unet_key = \"model.diffusion_model.\"\n\n    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA\n    if sum(k.startswith(\"model_ema\") for k in keys) > 100 and extract_ema:\n        print(f\"Checkpoint {path} has both EMA and non-EMA weights.\")\n        print(\n            \"In this conversion only the EMA weights are extracted. 
If you want to instead extract the non-EMA\"\n            \" weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag.\"\n        )\n        for key in keys:\n            if key.startswith(\"model.diffusion_model\"):\n                flat_ema_key = \"model_ema.\" + \"\".join(key.split(\".\")[1:])\n                unet_state_dict[key.replace(unet_key, \"\")] = checkpoint.pop(flat_ema_key)\n    else:\n        if sum(k.startswith(\"model_ema\") for k in keys) > 100:\n            print(\n                \"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA\"\n                \" weights (usually better for inference), please make sure to add the `--extract_ema` flag.\"\n            )\n\n        for key in keys:\n            if key.startswith(unet_key):\n                unet_state_dict[key.replace(unet_key, \"\")] = checkpoint.pop(key)\n\n    new_checkpoint = {}\n\n    new_checkpoint[\"time_embedding.linear_1.weight\"] = unet_state_dict[\"time_embed.0.weight\"]\n    new_checkpoint[\"time_embedding.linear_1.bias\"] = unet_state_dict[\"time_embed.0.bias\"]\n    new_checkpoint[\"time_embedding.linear_2.weight\"] = unet_state_dict[\"time_embed.2.weight\"]\n    new_checkpoint[\"time_embedding.linear_2.bias\"] = unet_state_dict[\"time_embed.2.bias\"]\n\n    if config[\"class_embed_type\"] is None:\n        # No parameters to port\n        ...\n    elif config[\"class_embed_type\"] == \"timestep\" or config[\"class_embed_type\"] == \"projection\":\n        new_checkpoint[\"class_embedding.linear_1.weight\"] = unet_state_dict[\"label_emb.0.0.weight\"]\n        new_checkpoint[\"class_embedding.linear_1.bias\"] = unet_state_dict[\"label_emb.0.0.bias\"]\n        new_checkpoint[\"class_embedding.linear_2.weight\"] = unet_state_dict[\"label_emb.0.2.weight\"]\n        new_checkpoint[\"class_embedding.linear_2.bias\"] = unet_state_dict[\"label_emb.0.2.bias\"]\n    else:\n        raise 
NotImplementedError(f\"Not implemented `class_embed_type`: {config['class_embed_type']}\")\n\n    new_checkpoint[\"conv_in.weight\"] = unet_state_dict[\"input_blocks.0.0.weight\"]\n    new_checkpoint[\"conv_in.bias\"] = unet_state_dict[\"input_blocks.0.0.bias\"]\n\n    if not controlnet:\n        new_checkpoint[\"conv_norm_out.weight\"] = unet_state_dict[\"out.0.weight\"]\n        new_checkpoint[\"conv_norm_out.bias\"] = unet_state_dict[\"out.0.bias\"]\n        new_checkpoint[\"conv_out.weight\"] = unet_state_dict[\"out.2.weight\"]\n        new_checkpoint[\"conv_out.bias\"] = unet_state_dict[\"out.2.bias\"]\n\n    # Retrieves the keys for the input blocks only\n    num_input_blocks = len({\".\".join(layer.split(\".\")[:2]) for layer in unet_state_dict if \"input_blocks\" in layer})\n    input_blocks = {\n        layer_id: [key for key in unet_state_dict if f\"input_blocks.{layer_id}\" in key]\n        for layer_id in range(num_input_blocks)\n    }\n\n    # Retrieves the keys for the middle blocks only\n    num_middle_blocks = len({\".\".join(layer.split(\".\")[:2]) for layer in unet_state_dict if \"middle_block\" in layer})\n    middle_blocks = {\n        layer_id: [key for key in unet_state_dict if f\"middle_block.{layer_id}\" in key]\n        for layer_id in range(num_middle_blocks)\n    }\n\n    # Retrieves the keys for the output blocks only\n    num_output_blocks = len({\".\".join(layer.split(\".\")[:2]) for layer in unet_state_dict if \"output_blocks\" in layer})\n    output_blocks = {\n        layer_id: [key for key in unet_state_dict if f\"output_blocks.{layer_id}\" in key]\n        for layer_id in range(num_output_blocks)\n    }\n\n    for i in range(1, num_input_blocks):\n        block_id = (i - 1) // (config[\"layers_per_block\"] + 1)\n        layer_in_block_id = (i - 1) % (config[\"layers_per_block\"] + 1)\n\n        resnets = [\n            key for key in input_blocks[i] if f\"input_blocks.{i}.0\" in key and f\"input_blocks.{i}.0.op\" not in key\n      
  ]\n        attentions = [key for key in input_blocks[i] if f\"input_blocks.{i}.1\" in key]\n\n        if f\"input_blocks.{i}.0.op.weight\" in unet_state_dict:\n            new_checkpoint[f\"down_blocks.{block_id}.downsamplers.0.conv.weight\"] = unet_state_dict.pop(\n                f\"input_blocks.{i}.0.op.weight\"\n            )\n            new_checkpoint[f\"down_blocks.{block_id}.downsamplers.0.conv.bias\"] = unet_state_dict.pop(\n                f\"input_blocks.{i}.0.op.bias\"\n            )\n\n        paths = renew_resnet_paths(resnets)\n        meta_path = {\"old\": f\"input_blocks.{i}.0\", \"new\": f\"down_blocks.{block_id}.resnets.{layer_in_block_id}\"}\n        assign_to_checkpoint(\n            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config\n        )\n\n        if len(attentions):\n            paths = renew_attention_paths(attentions)\n            meta_path = {\"old\": f\"input_blocks.{i}.1\", \"new\": f\"down_blocks.{block_id}.attentions.{layer_in_block_id}\"}\n            assign_to_checkpoint(\n                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config\n            )\n\n    resnet_0 = middle_blocks[0]\n    attentions = middle_blocks[1]\n    resnet_1 = middle_blocks[2]\n\n    resnet_0_paths = renew_resnet_paths(resnet_0)\n    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)\n\n    resnet_1_paths = renew_resnet_paths(resnet_1)\n    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)\n\n    attentions_paths = renew_attention_paths(attentions)\n    meta_path = {\"old\": \"middle_block.1\", \"new\": \"mid_block.attentions.0\"}\n    assign_to_checkpoint(\n        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config\n    )\n\n    for i in range(num_output_blocks):\n        block_id = i // (config[\"layers_per_block\"] + 1)\n        layer_in_block_id = 
i % (config[\"layers_per_block\"] + 1)\n        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]\n        output_block_list = {}\n\n        for layer in output_block_layers:\n            layer_id, layer_name = layer.split(\".\")[0], shave_segments(layer, 1)\n            if layer_id in output_block_list:\n                output_block_list[layer_id].append(layer_name)\n            else:\n                output_block_list[layer_id] = [layer_name]\n\n        if len(output_block_list) > 1:\n            resnets = [key for key in output_blocks[i] if f\"output_blocks.{i}.0\" in key]\n            attentions = [key for key in output_blocks[i] if f\"output_blocks.{i}.1\" in key]\n\n            resnet_0_paths = renew_resnet_paths(resnets)\n            paths = renew_resnet_paths(resnets)\n\n            meta_path = {\"old\": f\"output_blocks.{i}.0\", \"new\": f\"up_blocks.{block_id}.resnets.{layer_in_block_id}\"}\n            assign_to_checkpoint(\n                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config\n            )\n\n            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}\n            if [\"conv.bias\", \"conv.weight\"] in output_block_list.values():\n                index = list(output_block_list.values()).index([\"conv.bias\", \"conv.weight\"])\n                new_checkpoint[f\"up_blocks.{block_id}.upsamplers.0.conv.weight\"] = unet_state_dict[\n                    f\"output_blocks.{i}.{index}.conv.weight\"\n                ]\n                new_checkpoint[f\"up_blocks.{block_id}.upsamplers.0.conv.bias\"] = unet_state_dict[\n                    f\"output_blocks.{i}.{index}.conv.bias\"\n                ]\n\n                # Clear attentions as they have been attributed above.\n                if len(attentions) == 2:\n                    attentions = []\n\n            if len(attentions):\n                paths = renew_attention_paths(attentions)\n                
meta_path = {\n                    \"old\": f\"output_blocks.{i}.1\",\n                    \"new\": f\"up_blocks.{block_id}.attentions.{layer_in_block_id}\",\n                }\n                assign_to_checkpoint(\n                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config\n                )\n        else:\n            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)\n            for path in resnet_0_paths:\n                old_path = \".\".join([\"output_blocks\", str(i), path[\"old\"]])\n                new_path = \".\".join([\"up_blocks\", str(block_id), \"resnets\", str(layer_in_block_id), path[\"new\"]])\n\n                new_checkpoint[new_path] = unet_state_dict[old_path]\n\n    if controlnet:\n        # conditioning embedding\n\n        orig_index = 0\n\n        new_checkpoint[\"controlnet_cond_embedding.conv_in.weight\"] = unet_state_dict.pop(\n            f\"input_hint_block.{orig_index}.weight\"\n        )\n        new_checkpoint[\"controlnet_cond_embedding.conv_in.bias\"] = unet_state_dict.pop(\n            f\"input_hint_block.{orig_index}.bias\"\n        )\n\n        orig_index += 2\n\n        diffusers_index = 0\n\n        while diffusers_index < 6:\n            new_checkpoint[f\"controlnet_cond_embedding.blocks.{diffusers_index}.weight\"] = unet_state_dict.pop(\n                f\"input_hint_block.{orig_index}.weight\"\n            )\n            new_checkpoint[f\"controlnet_cond_embedding.blocks.{diffusers_index}.bias\"] = unet_state_dict.pop(\n                f\"input_hint_block.{orig_index}.bias\"\n            )\n            diffusers_index += 1\n            orig_index += 2\n\n        new_checkpoint[\"controlnet_cond_embedding.conv_out.weight\"] = unet_state_dict.pop(\n            f\"input_hint_block.{orig_index}.weight\"\n        )\n        new_checkpoint[\"controlnet_cond_embedding.conv_out.bias\"] = unet_state_dict.pop(\n            
f\"input_hint_block.{orig_index}.bias\"\n        )\n\n        # down blocks\n        for i in range(num_input_blocks):\n            new_checkpoint[f\"controlnet_down_blocks.{i}.weight\"] = unet_state_dict.pop(f\"zero_convs.{i}.0.weight\")\n            new_checkpoint[f\"controlnet_down_blocks.{i}.bias\"] = unet_state_dict.pop(f\"zero_convs.{i}.0.bias\")\n\n        # mid block\n        new_checkpoint[\"controlnet_mid_block.weight\"] = unet_state_dict.pop(\"middle_block_out.0.weight\")\n        new_checkpoint[\"controlnet_mid_block.bias\"] = unet_state_dict.pop(\"middle_block_out.0.bias\")\n\n    return new_checkpoint\n\n\ndef convert_ldm_vae_checkpoint(checkpoint, config):\n    # extract state dict for VAE\n    vae_state_dict = {}\n    vae_key = \"first_stage_model.\"\n    keys = list(checkpoint.keys())\n    for key in keys:\n        if key.startswith(vae_key):\n            vae_state_dict[key.replace(vae_key, \"\")] = checkpoint.get(key)\n\n    new_checkpoint = {}\n\n    new_checkpoint[\"encoder.conv_in.weight\"] = vae_state_dict[\"encoder.conv_in.weight\"]\n    new_checkpoint[\"encoder.conv_in.bias\"] = vae_state_dict[\"encoder.conv_in.bias\"]\n    new_checkpoint[\"encoder.conv_out.weight\"] = vae_state_dict[\"encoder.conv_out.weight\"]\n    new_checkpoint[\"encoder.conv_out.bias\"] = vae_state_dict[\"encoder.conv_out.bias\"]\n    new_checkpoint[\"encoder.conv_norm_out.weight\"] = vae_state_dict[\"encoder.norm_out.weight\"]\n    new_checkpoint[\"encoder.conv_norm_out.bias\"] = vae_state_dict[\"encoder.norm_out.bias\"]\n\n    new_checkpoint[\"decoder.conv_in.weight\"] = vae_state_dict[\"decoder.conv_in.weight\"]\n    new_checkpoint[\"decoder.conv_in.bias\"] = vae_state_dict[\"decoder.conv_in.bias\"]\n    new_checkpoint[\"decoder.conv_out.weight\"] = vae_state_dict[\"decoder.conv_out.weight\"]\n    new_checkpoint[\"decoder.conv_out.bias\"] = vae_state_dict[\"decoder.conv_out.bias\"]\n    new_checkpoint[\"decoder.conv_norm_out.weight\"] = 
vae_state_dict[\"decoder.norm_out.weight\"]\n    new_checkpoint[\"decoder.conv_norm_out.bias\"] = vae_state_dict[\"decoder.norm_out.bias\"]\n\n    new_checkpoint[\"quant_conv.weight\"] = vae_state_dict[\"quant_conv.weight\"]\n    new_checkpoint[\"quant_conv.bias\"] = vae_state_dict[\"quant_conv.bias\"]\n    new_checkpoint[\"post_quant_conv.weight\"] = vae_state_dict[\"post_quant_conv.weight\"]\n    new_checkpoint[\"post_quant_conv.bias\"] = vae_state_dict[\"post_quant_conv.bias\"]\n\n    # Retrieves the keys for the encoder down blocks only\n    num_down_blocks = len({\".\".join(layer.split(\".\")[:3]) for layer in vae_state_dict if \"encoder.down\" in layer})\n    down_blocks = {\n        layer_id: [key for key in vae_state_dict if f\"down.{layer_id}\" in key] for layer_id in range(num_down_blocks)\n    }\n\n    # Retrieves the keys for the decoder up blocks only\n    num_up_blocks = len({\".\".join(layer.split(\".\")[:3]) for layer in vae_state_dict if \"decoder.up\" in layer})\n    up_blocks = {\n        layer_id: [key for key in vae_state_dict if f\"up.{layer_id}\" in key] for layer_id in range(num_up_blocks)\n    }\n\n    for i in range(num_down_blocks):\n        resnets = [key for key in down_blocks[i] if f\"down.{i}\" in key and f\"down.{i}.downsample\" not in key]\n\n        if f\"encoder.down.{i}.downsample.conv.weight\" in vae_state_dict:\n            new_checkpoint[f\"encoder.down_blocks.{i}.downsamplers.0.conv.weight\"] = vae_state_dict.pop(\n                f\"encoder.down.{i}.downsample.conv.weight\"\n            )\n            new_checkpoint[f\"encoder.down_blocks.{i}.downsamplers.0.conv.bias\"] = vae_state_dict.pop(\n                f\"encoder.down.{i}.downsample.conv.bias\"\n            )\n\n        paths = renew_vae_resnet_paths(resnets)\n        meta_path = {\"old\": f\"down.{i}.block\", \"new\": f\"down_blocks.{i}.resnets\"}\n        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], 
config=config)\n\n    mid_resnets = [key for key in vae_state_dict if \"encoder.mid.block\" in key]\n    num_mid_res_blocks = 2\n    for i in range(1, num_mid_res_blocks + 1):\n        resnets = [key for key in mid_resnets if f\"encoder.mid.block_{i}\" in key]\n\n        paths = renew_vae_resnet_paths(resnets)\n        meta_path = {\"old\": f\"mid.block_{i}\", \"new\": f\"mid_block.resnets.{i - 1}\"}\n        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)\n\n    mid_attentions = [key for key in vae_state_dict if \"encoder.mid.attn\" in key]\n    paths = renew_vae_attention_paths(mid_attentions)\n    meta_path = {\"old\": \"mid.attn_1\", \"new\": \"mid_block.attentions.0\"}\n    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)\n    conv_attn_to_linear(new_checkpoint)\n\n    for i in range(num_up_blocks):\n        block_id = num_up_blocks - 1 - i\n        resnets = [\n            key for key in up_blocks[block_id] if f\"up.{block_id}\" in key and f\"up.{block_id}.upsample\" not in key\n        ]\n\n        if f\"decoder.up.{block_id}.upsample.conv.weight\" in vae_state_dict:\n            new_checkpoint[f\"decoder.up_blocks.{i}.upsamplers.0.conv.weight\"] = vae_state_dict[\n                f\"decoder.up.{block_id}.upsample.conv.weight\"\n            ]\n            new_checkpoint[f\"decoder.up_blocks.{i}.upsamplers.0.conv.bias\"] = vae_state_dict[\n                f\"decoder.up.{block_id}.upsample.conv.bias\"\n            ]\n\n        paths = renew_vae_resnet_paths(resnets)\n        meta_path = {\"old\": f\"up.{block_id}.block\", \"new\": f\"up_blocks.{i}.resnets\"}\n        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)\n\n    mid_resnets = [key for key in vae_state_dict if \"decoder.mid.block\" in key]\n    num_mid_res_blocks = 2\n    for i in range(1, num_mid_res_blocks + 
1):\n        resnets = [key for key in mid_resnets if f\"decoder.mid.block_{i}\" in key]\n\n        paths = renew_vae_resnet_paths(resnets)\n        meta_path = {\"old\": f\"mid.block_{i}\", \"new\": f\"mid_block.resnets.{i - 1}\"}\n        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)\n\n    mid_attentions = [key for key in vae_state_dict if \"decoder.mid.attn\" in key]\n    paths = renew_vae_attention_paths(mid_attentions)\n    meta_path = {\"old\": \"mid.attn_1\", \"new\": \"mid_block.attentions.0\"}\n    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)\n    conv_attn_to_linear(new_checkpoint)\n    return new_checkpoint\n\n\ndef convert_ldm_bert_checkpoint(checkpoint, config):\n    def _copy_attn_layer(hf_attn_layer, pt_attn_layer):\n        hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight\n        hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight\n        hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight\n\n        hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight\n        hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias\n\n    def _copy_linear(hf_linear, pt_linear):\n        hf_linear.weight = pt_linear.weight\n        hf_linear.bias = pt_linear.bias\n\n    def _copy_layer(hf_layer, pt_layer):\n        # copy layer norms\n        _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])\n        _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])\n\n        # copy attn\n        _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])\n\n        # copy MLP\n        pt_mlp = pt_layer[1][1]\n        _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])\n        _copy_linear(hf_layer.fc2, pt_mlp.net[2])\n\n    def _copy_layers(hf_layers, pt_layers):\n        for i, hf_layer in enumerate(hf_layers):\n            if i != 0:\n                i += i\n            pt_layer = pt_layers[i : i + 2]\n    
        _copy_layer(hf_layer, pt_layer)\n\n    hf_model = LDMBertModel(config).eval()\n\n    # copy  embeds\n    hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight\n    hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight\n\n    # copy layer norm\n    _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)\n\n    # copy hidden layers\n    _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)\n\n    _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)\n\n    return hf_model\n\n\ndef convert_ldm_clip_checkpoint(checkpoint):\n    text_model = CLIPTextModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n    keys = list(checkpoint.keys())\n\n    text_model_dict = {}\n\n    for key in keys:\n        if key.startswith(\"cond_stage_model.transformer\"):\n            text_model_dict[key[len(\"cond_stage_model.transformer.\") :]] = checkpoint[key]\n\n    text_model.load_state_dict(text_model_dict)\n\n    return text_model\n\n\ntextenc_conversion_lst = [\n    (\"cond_stage_model.model.positional_embedding\", \"text_model.embeddings.position_embedding.weight\"),\n    (\"cond_stage_model.model.token_embedding.weight\", \"text_model.embeddings.token_embedding.weight\"),\n    (\"cond_stage_model.model.ln_final.weight\", \"text_model.final_layer_norm.weight\"),\n    (\"cond_stage_model.model.ln_final.bias\", \"text_model.final_layer_norm.bias\"),\n]\ntextenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}\n\ntextenc_transformer_conversion_lst = [\n    # (stable-diffusion, HF Diffusers)\n    (\"resblocks.\", \"text_model.encoder.layers.\"),\n    (\"ln_1\", \"layer_norm1\"),\n    (\"ln_2\", \"layer_norm2\"),\n    (\".c_fc.\", \".fc1.\"),\n    (\".c_proj.\", \".fc2.\"),\n    (\".attn\", \".self_attn\"),\n    (\"ln_final.\", \"transformer.text_model.final_layer_norm.\"),\n    (\"token_embedding.weight\", 
\"transformer.text_model.embeddings.token_embedding.weight\"),\n    (\"positional_embedding\", \"transformer.text_model.embeddings.position_embedding.weight\"),\n]\nprotected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}\ntextenc_pattern = re.compile(\"|\".join(protected.keys()))\n\n\ndef convert_paint_by_example_checkpoint(checkpoint):\n    config = CLIPVisionConfig.from_pretrained(\"openai/clip-vit-large-patch14\")\n    model = PaintByExampleImageEncoder(config)\n\n    keys = list(checkpoint.keys())\n\n    text_model_dict = {}\n\n    for key in keys:\n        if key.startswith(\"cond_stage_model.transformer\"):\n            text_model_dict[key[len(\"cond_stage_model.transformer.\") :]] = checkpoint[key]\n\n    # load clip vision\n    model.model.load_state_dict(text_model_dict)\n\n    # load mapper\n    keys_mapper = {\n        k[len(\"cond_stage_model.mapper.res\") :]: v\n        for k, v in checkpoint.items()\n        if k.startswith(\"cond_stage_model.mapper\")\n    }\n\n    MAPPING = {\n        \"attn.c_qkv\": [\"attn1.to_q\", \"attn1.to_k\", \"attn1.to_v\"],\n        \"attn.c_proj\": [\"attn1.to_out.0\"],\n        \"ln_1\": [\"norm1\"],\n        \"ln_2\": [\"norm3\"],\n        \"mlp.c_fc\": [\"ff.net.0.proj\"],\n        \"mlp.c_proj\": [\"ff.net.2\"],\n    }\n\n    mapped_weights = {}\n    for key, value in keys_mapper.items():\n        prefix = key[: len(\"blocks.i\")]\n        suffix = key.split(prefix)[-1].split(\".\")[-1]\n        name = key.split(prefix)[-1].split(suffix)[0][1:-1]\n        mapped_names = MAPPING[name]\n\n        num_splits = len(mapped_names)\n        for i, mapped_name in enumerate(mapped_names):\n            new_name = \".\".join([prefix, mapped_name, suffix])\n            shape = value.shape[0] // num_splits\n            mapped_weights[new_name] = value[i * shape : (i + 1) * shape]\n\n    model.mapper.load_state_dict(mapped_weights)\n\n    # load final layer norm\n    model.final_layer_norm.load_state_dict(\n  
      {\n            \"bias\": checkpoint[\"cond_stage_model.final_ln.bias\"],\n            \"weight\": checkpoint[\"cond_stage_model.final_ln.weight\"],\n        }\n    )\n\n    # load final proj\n    model.proj_out.load_state_dict(\n        {\n            \"bias\": checkpoint[\"proj_out.bias\"],\n            \"weight\": checkpoint[\"proj_out.weight\"],\n        }\n    )\n\n    # load uncond vector\n    model.uncond_vector.data = torch.nn.Parameter(checkpoint[\"learnable_vector\"])\n    return model\n\n\ndef convert_open_clip_checkpoint(checkpoint):\n    text_model = CLIPTextModel.from_pretrained(\"stabilityai/stable-diffusion-2\", subfolder=\"text_encoder\")\n\n    keys = list(checkpoint.keys())\n\n    text_model_dict = {}\n\n    if \"cond_stage_model.model.text_projection\" in checkpoint:\n        d_model = int(checkpoint[\"cond_stage_model.model.text_projection\"].shape[0])\n    else:\n        d_model = 1024\n\n    text_model_dict[\"text_model.embeddings.position_ids\"] = text_model.text_model.embeddings.get_buffer(\"position_ids\")\n\n    for key in keys:\n        if \"resblocks.23\" in key:  # Diffusers drops the final layer and only uses the penultimate layer\n            continue\n        if key in textenc_conversion_map:\n            text_model_dict[textenc_conversion_map[key]] = checkpoint[key]\n        if key.startswith(\"cond_stage_model.model.transformer.\"):\n            new_key = key[len(\"cond_stage_model.model.transformer.\") :]\n            if new_key.endswith(\".in_proj_weight\"):\n                new_key = new_key[: -len(\".in_proj_weight\")]\n                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)\n                text_model_dict[new_key + \".q_proj.weight\"] = checkpoint[key][:d_model, :]\n                text_model_dict[new_key + \".k_proj.weight\"] = checkpoint[key][d_model : d_model * 2, :]\n                text_model_dict[new_key + \".v_proj.weight\"] = checkpoint[key][d_model * 2 :, :]\n          
  elif new_key.endswith(\".in_proj_bias\"):\n                new_key = new_key[: -len(\".in_proj_bias\")]\n                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)\n                text_model_dict[new_key + \".q_proj.bias\"] = checkpoint[key][:d_model]\n                text_model_dict[new_key + \".k_proj.bias\"] = checkpoint[key][d_model : d_model * 2]\n                text_model_dict[new_key + \".v_proj.bias\"] = checkpoint[key][d_model * 2 :]\n            else:\n                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)\n\n                text_model_dict[new_key] = checkpoint[key]\n\n    text_model.load_state_dict(text_model_dict)\n\n    return text_model\n\n\ndef stable_unclip_image_encoder(original_config):\n    \"\"\"\n    Returns the image processor and clip image encoder for the img2img unclip pipeline.\n\n    We currently know of two types of stable unclip models which separately use the clip and the openclip image\n    encoders.\n    \"\"\"\n\n    image_embedder_config = original_config.model.params.embedder_config\n\n    sd_clip_image_embedder_class = image_embedder_config.target\n    sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(\".\")[-1]\n\n    if sd_clip_image_embedder_class == \"ClipImageEmbedder\":\n        clip_model_name = image_embedder_config.params.model\n\n        if clip_model_name == \"ViT-L/14\":\n            feature_extractor = CLIPImageProcessor()\n            image_encoder = CLIPVisionModelWithProjection.from_pretrained(\"openai/clip-vit-large-patch14\")\n        else:\n            raise NotImplementedError(f\"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}\")\n\n    elif sd_clip_image_embedder_class == \"FrozenOpenCLIPImageEmbedder\":\n        feature_extractor = CLIPImageProcessor()\n        image_encoder = CLIPVisionModelWithProjection.from_pretrained(\"laion/CLIP-ViT-H-14-laion2B-s32B-b79K\")\n    
else:\n        raise NotImplementedError(\n            f\"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}\"\n        )\n\n    return feature_extractor, image_encoder\n\n\ndef stable_unclip_image_noising_components(\n    original_config, clip_stats_path: Optional[str] = None, device: Optional[str] = None\n):\n    \"\"\"\n    Returns the noising components for the img2img and txt2img unclip pipelines.\n\n    Converts the stability noise augmentor into\n    1. a `StableUnCLIPImageNormalizer` for holding the CLIP stats\n    2. a `DDPMScheduler` for holding the noise schedule\n\n    If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided.\n    \"\"\"\n    noise_aug_config = original_config.model.params.noise_aug_config\n    noise_aug_class = noise_aug_config.target\n    noise_aug_class = noise_aug_class.split(\".\")[-1]\n\n    if noise_aug_class == \"CLIPEmbeddingNoiseAugmentation\":\n        noise_aug_config = noise_aug_config.params\n        embedding_dim = noise_aug_config.timestep_dim\n        max_noise_level = noise_aug_config.noise_schedule_config.timesteps\n        beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule\n\n        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim)\n        image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule)\n\n        if \"clip_stats_path\" in noise_aug_config:\n            if clip_stats_path is None:\n                raise ValueError(\"This stable unclip config requires a `clip_stats_path`\")\n\n            clip_mean, clip_std = torch.load(clip_stats_path, map_location=device)\n            clip_mean = clip_mean[None, :]\n            clip_std = clip_std[None, :]\n\n            clip_stats_state_dict = {\n                \"mean\": clip_mean,\n                \"std\": clip_std,\n            }\n\n            
image_normalizer.load_state_dict(clip_stats_state_dict)\n    else:\n        raise NotImplementedError(f\"Unknown noise augmentor class: {noise_aug_class}\")\n\n    return image_normalizer, image_noising_scheduler\n\n\ndef convert_controlnet_checkpoint(\n    checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema\n):\n    ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)\n    ctrlnet_config[\"upcast_attention\"] = upcast_attention\n\n    ctrlnet_config.pop(\"sample_size\")\n\n    controlnet_model = ControlNetModel(**ctrlnet_config)\n\n    converted_ctrl_checkpoint = convert_ldm_unet_checkpoint(\n        checkpoint, ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, controlnet=True\n    )\n\n    controlnet_model.load_state_dict(converted_ctrl_checkpoint)\n\n    return controlnet_model\n"
  },
  {
    "path": "animatediff/utils/convert_lora_safetensor_to_diffusers.py",
    "content": "# coding=utf-8\n# Copyright 2023, Haofan Wang, Qixun Wang, All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# \n#  Changes were made to this source code by Yuwei Guo.\n\"\"\" Conversion script for the LoRA's safetensors checkpoints. \"\"\"\n\nimport argparse\n\nimport torch\nfrom safetensors.torch import load_file\n\nfrom diffusers import StableDiffusionPipeline\n\n\ndef load_diffusers_lora(pipeline, state_dict, alpha=1.0):\n    \"\"\"Merge LoRA weights stored under diffusers-style keys into `pipeline.unet` in place.\n\n    For every key that does not contain \"up.\", the paired up key is found by replacing\n    \".down.\" with \".up.\", the target layer is resolved from the dotted key, and\n    `alpha * (up @ down)` is added to that layer's weight. Returns the same pipeline.\n    \"\"\"\n    # directly update weight in diffusers model\n    for key in state_dict:\n        # only process lora down key\n        if \"up.\" in key: continue\n\n        up_key    = key.replace(\".down.\", \".up.\")\n        model_key = key.replace(\"processor.\", \"\").replace(\"_lora\", \"\").replace(\"down.\", \"\").replace(\"up.\", \"\")\n        model_key = model_key.replace(\"to_out.\", \"to_out.0.\")\n        layer_infos = model_key.split(\".\")[:-1]\n\n        curr_layer = pipeline.unet\n        while len(layer_infos) > 0:\n            temp_name = layer_infos.pop(0)\n            curr_layer = curr_layer.__getattr__(temp_name)\n\n        weight_down = state_dict[key]\n        weight_up   = state_dict[up_key]\n        curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)\n\n    return pipeline\n\n\ndef convert_lora(pipeline, state_dict, LORA_PREFIX_UNET=\"lora_unet\", LORA_PREFIX_TEXT_ENCODER=\"lora_te\", alpha=0.6):\n    \"\"\"Merge LoRA weights from `state_dict` into `pipeline.text_encoder` / `pipeline.unet` in place.\n\n    Keys containing \"text\" target the text encoder, all others the UNet; keys containing\n    \".alpha\" are skipped. Each lora_up/lora_down pair is applied once as\n    `W += alpha * (up @ down)`. Returns the same pipeline.\n    \"\"\"\n    # load base model\n    # pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)\n\n    # load LoRA weight from .safetensors\n    # state_dict = load_file(checkpoint_path)\n\n    visited = []\n\n    # directly update weight in diffusers model\n    for key in state_dict:\n        # it is suggested to print out the key, it usually will be something like below\n        # \"lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight\"\n\n        # as we have set the alpha beforehand, so just skip\n        if \".alpha\" in key or key in visited:\n            continue\n\n        if \"text\" in key:\n            layer_infos = key.split(\".\")[0].split(LORA_PREFIX_TEXT_ENCODER + \"_\")[-1].split(\"_\")\n            curr_layer = pipeline.text_encoder\n        else:\n            layer_infos = key.split(\".\")[0].split(LORA_PREFIX_UNET + \"_\")[-1].split(\"_\")\n            curr_layer = pipeline.unet\n\n        # find the target layer\n        temp_name = layer_infos.pop(0)\n        while len(layer_infos) > -1:\n            try:\n                curr_layer = curr_layer.__getattr__(temp_name)\n                if len(layer_infos) > 0:\n                    temp_name = layer_infos.pop(0)\n                elif len(layer_infos) == 0:\n                    break\n            except Exception:\n                if len(temp_name) > 0:\n                    temp_name += \"_\" + layer_infos.pop(0)\n                else:\n                    temp_name = layer_infos.pop(0)\n\n        pair_keys = []\n        if \"lora_down\" in key:\n            pair_keys.append(key.replace(\"lora_down\", \"lora_up\"))\n            pair_keys.append(key)\n        else:\n            pair_keys.append(key)\n            pair_keys.append(key.replace(\"lora_up\", \"lora_down\"))\n\n        # update weight\n        if len(state_dict[pair_keys[0]].shape) == 4:\n            weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)\n            weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)\n            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)\n        else:\n            weight_up = state_dict[pair_keys[0]].to(torch.float32)\n            weight_down = state_dict[pair_keys[1]].to(torch.float32)\n            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)\n\n        # update visited list\n        for item in pair_keys:\n            visited.append(item)\n\n    return pipeline\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n\n    parser.add_argument(\n        \"--base_model_path\", default=None, type=str, required=True, help=\"Path to the base model in diffusers format.\"\n    )\n    parser.add_argument(\n        \"--checkpoint_path\", default=None, type=str, required=True, help=\"Path to the checkpoint to convert.\"\n    )\n    parser.add_argument(\"--dump_path\", default=None, type=str, required=True, help=\"Path to the output model.\")\n    parser.add_argument(\n        \"--lora_prefix_unet\", default=\"lora_unet\", type=str, help=\"The prefix of UNet weight in safetensors\"\n    )\n    parser.add_argument(\n        \"--lora_prefix_text_encoder\",\n        default=\"lora_te\",\n        type=str,\n        help=\"The prefix of text encoder weight in safetensors\",\n    )\n    parser.add_argument(\"--alpha\", default=0.75, type=float, help=\"The merging ratio in W = W0 + alpha * deltaW\")\n    parser.add_argument(\n        \"--to_safetensors\", action=\"store_true\", help=\"Whether to store pipeline in safetensors format or not.\"\n    )\n    parser.add_argument(\"--device\", type=str, help=\"Device to use (e.g. cpu, cuda:0, cuda:1, etc.)\")\n\n    args = parser.parse_args()\n\n    base_model_path = args.base_model_path\n    checkpoint_path = args.checkpoint_path\n    dump_path = args.dump_path\n    lora_prefix_unet = args.lora_prefix_unet\n    lora_prefix_text_encoder = args.lora_prefix_text_encoder\n    alpha = args.alpha\n\n    # `convert_lora` merges into an already-loaded pipeline and state dict, so load both here.\n    pipe = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)\n    state_dict = load_file(checkpoint_path)\n    pipe = convert_lora(pipe, state_dict, lora_prefix_unet, lora_prefix_text_encoder, alpha)\n\n    pipe = pipe.to(args.device)\n    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)\n"
  },
  {
    "path": "animatediff/utils/util.py",
    "content": "import os\nimport imageio\nimport numpy as np\nfrom typing import Union\n\nimport torch\nimport torchvision\nimport torch.distributed as dist\n\nfrom huggingface_hub import snapshot_download\nfrom safetensors import safe_open\nfrom tqdm import tqdm\nfrom einops import rearrange\nfrom animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint\nfrom animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora, load_diffusers_lora\n\n\nMOTION_MODULES = [\n    \"mm_sd_v14.ckpt\", \n    \"mm_sd_v15.ckpt\", \n    \"mm_sd_v15_v2.ckpt\", \n    \"v3_sd15_mm.ckpt\",\n]\n\nADAPTERS = [\n    # \"mm_sd_v14.ckpt\",\n    # \"mm_sd_v15.ckpt\",\n    # \"mm_sd_v15_v2.ckpt\",\n    # \"mm_sdxl_v10_beta.ckpt\",\n    \"v2_lora_PanLeft.ckpt\",\n    \"v2_lora_PanRight.ckpt\",\n    \"v2_lora_RollingAnticlockwise.ckpt\",\n    \"v2_lora_RollingClockwise.ckpt\",\n    \"v2_lora_TiltDown.ckpt\",\n    \"v2_lora_TiltUp.ckpt\",\n    \"v2_lora_ZoomIn.ckpt\",\n    \"v2_lora_ZoomOut.ckpt\",\n    \"v3_sd15_adapter.ckpt\",\n    # \"v3_sd15_mm.ckpt\",\n    \"v3_sd15_sparsectrl_rgb.ckpt\",\n    \"v3_sd15_sparsectrl_scribble.ckpt\",\n]\n\nBACKUP_DREAMBOOTH_MODELS = [\n    \"realisticVisionV60B1_v51VAE.safetensors\",\n    \"majicmixRealistic_v4.safetensors\",\n    \"leosamsFilmgirlUltra_velvia20Lora.safetensors\",\n    \"toonyou_beta3.safetensors\",\n    \"majicmixRealistic_v5Preview.safetensors\",\n    \"rcnzCartoon3d_v10.safetensors\",\n    \"lyriel_v16.safetensors\",\n    \"leosamsHelloworldXL_filmGrain20.safetensors\",\n    \"TUSUN.safetensors\",\n]\n\n\ndef zero_rank_print(s):\n    if (not dist.is_initialized()) and (dist.is_initialized() and dist.get_rank() == 0): print(\"### \" + s)\n\n\ndef save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):\n    videos = rearrange(videos, \"b c t h w -> t b c h w\")\n    outputs = []\n    for x in videos:\n        x = 
torchvision.utils.make_grid(x, nrow=n_rows)\n        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)\n        if rescale:\n            x = (x + 1.0) / 2.0  # -1,1 -> 0,1\n        x = (x * 255).numpy().astype(np.uint8)\n        outputs.append(x)\n\n    os.makedirs(os.path.dirname(path), exist_ok=True)\n    imageio.mimsave(path, outputs, fps=fps)\n\n\ndef auto_download(local_path, is_dreambooth_lora=False):\n    hf_repo = \"guoyww/animatediff_t2i_backups\" if is_dreambooth_lora else \"guoyww/animatediff\"\n    folder, filename = os.path.split(local_path)\n\n    if not os.path.exists(local_path):\n        print(f\"local file {local_path} does not exist. trying to download from {hf_repo}\")\n\n        if is_dreambooth_lora: assert filename in BACKUP_DREAMBOOTH_MODELS, f\"{filename} does not exist in {hf_repo}\"\n        else: assert filename in MOTION_MODULES + ADAPTERS, f\"{filename} does not exist in {hf_repo}\"\n\n        folder = \".\" if folder == \"\" else folder\n        os.makedirs(folder, exist_ok=True)\n        snapshot_download(repo_id=hf_repo, local_dir=folder, allow_patterns=[filename])\n\n\ndef load_weights(\n    animation_pipeline,\n    # motion module\n    motion_module_path         = \"\",\n    motion_module_lora_configs = [],\n    # domain adapter\n    adapter_lora_path          = \"\",\n    adapter_lora_scale         = 1.0,\n    # image layers\n    dreambooth_model_path      = \"\",\n    lora_model_path            = \"\",\n    lora_alpha                 = 0.8,\n):\n    # motion module\n    unet_state_dict = {}\n    if motion_module_path != \"\":\n        auto_download(motion_module_path, is_dreambooth_lora=False)\n\n        print(f\"load motion module from {motion_module_path}\")\n        motion_module_state_dict = torch.load(motion_module_path, map_location=\"cpu\")\n        motion_module_state_dict = motion_module_state_dict[\"state_dict\"] if \"state_dict\" in motion_module_state_dict else motion_module_state_dict\n        # filter parameters\n  
      for name, param in motion_module_state_dict.items():\n            if not \"motion_modules.\" in name: continue\n            if \"pos_encoder.pe\" in name: continue\n            unet_state_dict.update({name: param})\n        unet_state_dict.pop(\"animatediff_config\", \"\")\n    \n    missing, unexpected = animation_pipeline.unet.load_state_dict(unet_state_dict, strict=False)\n    assert len(unexpected) == 0\n    del unet_state_dict\n\n    # base model\n    if dreambooth_model_path != \"\":\n        auto_download(dreambooth_model_path, is_dreambooth_lora=True)\n\n        print(f\"load dreambooth model from {dreambooth_model_path}\")\n        if dreambooth_model_path.endswith(\".safetensors\"):\n            dreambooth_state_dict = {}\n            with safe_open(dreambooth_model_path, framework=\"pt\", device=\"cpu\") as f:\n                for key in f.keys():\n                    dreambooth_state_dict[key] = f.get_tensor(key)\n        elif dreambooth_model_path.endswith(\".ckpt\"):\n            dreambooth_state_dict = torch.load(dreambooth_model_path, map_location=\"cpu\")\n            \n        # 1. vae\n        converted_vae_checkpoint = convert_ldm_vae_checkpoint(dreambooth_state_dict, animation_pipeline.vae.config)\n        animation_pipeline.vae.load_state_dict(converted_vae_checkpoint)\n        # 2. unet\n        converted_unet_checkpoint = convert_ldm_unet_checkpoint(dreambooth_state_dict, animation_pipeline.unet.config)\n        animation_pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False)\n        # 3. 
text_model\n        animation_pipeline.text_encoder = convert_ldm_clip_checkpoint(dreambooth_state_dict)\n        del dreambooth_state_dict\n        \n    # lora layers\n    if lora_model_path != \"\":\n        auto_download(lora_model_path, is_dreambooth_lora=True)\n\n        print(f\"load lora model from {lora_model_path}\")\n        assert lora_model_path.endswith(\".safetensors\")\n        lora_state_dict = {}\n        with safe_open(lora_model_path, framework=\"pt\", device=\"cpu\") as f:\n            for key in f.keys():\n                lora_state_dict[key] = f.get_tensor(key)\n                \n        animation_pipeline = convert_lora(animation_pipeline, lora_state_dict, alpha=lora_alpha)\n        del lora_state_dict\n\n    # domain adapter lora\n    if adapter_lora_path != \"\":\n        auto_download(adapter_lora_path, is_dreambooth_lora=False)\n\n        print(f\"load domain lora from {adapter_lora_path}\")\n        domain_lora_state_dict = torch.load(adapter_lora_path, map_location=\"cpu\")\n        domain_lora_state_dict = domain_lora_state_dict[\"state_dict\"] if \"state_dict\" in domain_lora_state_dict else domain_lora_state_dict\n        domain_lora_state_dict.pop(\"animatediff_config\", \"\")\n\n        animation_pipeline = load_diffusers_lora(animation_pipeline, domain_lora_state_dict, alpha=adapter_lora_scale)\n\n    # motion module lora\n    for motion_module_lora_config in motion_module_lora_configs:\n        path, alpha = motion_module_lora_config[\"path\"], motion_module_lora_config[\"alpha\"]\n\n        auto_download(path, is_dreambooth_lora=False)\n\n        print(f\"load motion LoRA from {path}\")\n        motion_lora_state_dict = torch.load(path, map_location=\"cpu\")\n        motion_lora_state_dict = motion_lora_state_dict[\"state_dict\"] if \"state_dict\" in motion_lora_state_dict else motion_lora_state_dict\n        motion_lora_state_dict.pop(\"animatediff_config\", \"\")\n\n        animation_pipeline = 
load_diffusers_lora(animation_pipeline, motion_lora_state_dict, alpha)\n\n    return animation_pipeline\n"
  },
  {
    "path": "app.py",
    "content": "\nimport os\nimport json\nimport torch\nimport random\n\nimport gradio as gr\nfrom glob import glob\nfrom omegaconf import OmegaConf\nfrom datetime import datetime\nfrom safetensors import safe_open\n\nfrom diffusers import AutoencoderKL\nfrom diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler\nfrom diffusers.utils.import_utils import is_xformers_available\nfrom transformers import CLIPTextModel, CLIPTokenizer\n\nfrom animatediff.models.unet import UNet3DConditionModel\nfrom animatediff.pipelines.pipeline_animation import AnimationPipeline\nfrom animatediff.utils.util import save_videos_grid, load_weights, auto_download, MOTION_MODULES, BACKUP_DREAMBOOTH_MODELS\nfrom animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint\nfrom animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora\nimport pdb\n\n\nsample_idx = 0\nscheduler_dict = {\n    \"DDIM\": DDIMScheduler,\n    \"Euler\": EulerDiscreteScheduler,\n    \"PNDM\": PNDMScheduler,\n}\n\ncss = \"\"\"\n.toolbutton {\n    margin-buttom: 0em 0em 0em 0em;\n    max-width: 2.5em;\n    min-width: 2.5em !important;\n    height: 2.5em;\n}\n\"\"\"\n\nPRETRAINED_SD = \"runwayml/stable-diffusion-v1-5\"\n\ndefault_motion_module = \"v3_sd15_mm.ckpt\"\ndefault_inference_config = \"configs/inference/inference-v3.yaml\"\ndefault_dreambooth_model = \"realisticVisionV60B1_v51VAE.safetensors\"\ndefault_prompt = \"b&w photo of 42 y.o man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\ndefault_n_prompt = \"semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, 
deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck\"\ndefault_seed = 8893659352891878017\n\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\n\nclass AnimateController:\n    def __init__(self):\n        # config dirs\n        self.basedir = os.getcwd()\n        self.stable_diffusion_dir = os.path.join(self.basedir, \"models\", \"StableDiffusion\")\n        self.motion_module_dir = os.path.join(self.basedir, \"models\", \"Motion_Module\")\n        self.personalized_model_dir = os.path.join(self.basedir, \"models\", \"DreamBooth_LoRA\")\n        self.savedir = os.path.join(self.basedir, \"samples\", datetime.now().strftime(\"Gradio-%Y-%m-%dT%H-%M-%S\"))\n        self.savedir_sample = os.path.join(self.savedir, \"sample\")\n        os.makedirs(self.savedir, exist_ok=True)\n\n        self.stable_diffusion_list = [PRETRAINED_SD]\n        self.motion_module_list = MOTION_MODULES\n        self.personalized_model_list = BACKUP_DREAMBOOTH_MODELS\n        \n        # config models\n        self.pipeline = None\n        # self.lora_model_state_dict = {}\n        \n        self.refresh_stable_diffusion()\n        self.refresh_personalized_model()\n        \n        # default setting\n        self.update_pipeline(\n            stable_diffusion_dropdown=PRETRAINED_SD,\n            motion_module_dropdown=default_motion_module,\n            base_model_dropdown=default_dreambooth_model,\n            sampler_dropdown=\"DDIM\",\n        )\n\n    def refresh_stable_diffusion(self):\n        self.stable_diffusion_list = [PRETRAINED_SD] + glob(os.path.join(self.stable_diffusion_dir, \"*/\"))\n\n    def refresh_personalized_model(self):\n        personalized_model_list = glob(os.path.join(self.personalized_model_dir, \"*.safetensors\"))\n        self.personalized_model_list = BACKUP_DREAMBOOTH_MODELS 
+ [os.path.basename(p) for p in personalized_model_list if os.path.basename(p) not in BACKUP_DREAMBOOTH_MODELS]\n\n    # for dropdown update\n    def update_pipeline(\n        self,\n        stable_diffusion_dropdown,\n        motion_module_dropdown,\n        base_model_dropdown=\"\",\n        lora_model_dropdown=\"none\",\n        lora_alpha_dropdown=\"0.6\",\n        sampler_dropdown=\"DDIM\",\n    ):\n        if \"v2\" in motion_module_dropdown:\n            inference_config = \"configs/inference/inference-v2.yaml\"\n        elif \"v3\" in motion_module_dropdown:\n            inference_config = \"configs/inference/inference-v3.yaml\"\n        else:\n            inference_config = \"configs/inference/inference-v1.yaml\"\n\n        unet = UNet3DConditionModel.from_pretrained_2d(\n            stable_diffusion_dropdown, subfolder=\"unet\", \n            unet_additional_kwargs=OmegaConf.load(inference_config).unet_additional_kwargs\n        )\n        if is_xformers_available() and torch.cuda.is_available():\n            unet.enable_xformers_memory_efficient_attention()\n\n        noise_scheduler_cls = scheduler_dict[sampler_dropdown]\n        noise_scheduler_kwargs = OmegaConf.load(inference_config).noise_scheduler_kwargs\n        if noise_scheduler_cls == EulerDiscreteScheduler:\n            noise_scheduler_kwargs.pop(\"steps_offset\")\n            noise_scheduler_kwargs.pop(\"clip_sample\")\n        elif noise_scheduler_cls == PNDMScheduler:\n            noise_scheduler_kwargs.pop(\"clip_sample\")\n\n        pipeline = AnimationPipeline(\n            unet=unet,\n            vae=AutoencoderKL.from_pretrained(stable_diffusion_dropdown, subfolder=\"vae\"), \n            text_encoder=CLIPTextModel.from_pretrained(stable_diffusion_dropdown, subfolder=\"text_encoder\"), \n            tokenizer=CLIPTokenizer.from_pretrained(stable_diffusion_dropdown, subfolder=\"tokenizer\"), \n            scheduler=noise_scheduler_cls(**noise_scheduler_kwargs),\n        )\n\n        
pipeline = load_weights(\n            pipeline,\n            motion_module_path=os.path.join(self.motion_module_dir, motion_module_dropdown),\n            dreambooth_model_path=os.path.join(self.personalized_model_dir, base_model_dropdown) if base_model_dropdown != \"\" else \"\",\n            lora_model_path=os.path.join(self.personalized_model_dir, lora_model_dropdown) if lora_model_dropdown != \"none\" else \"\",\n            lora_alpha=float(lora_alpha_dropdown),\n        )\n\n        pipeline.to(device)\n        self.pipeline = pipeline\n        print(\"done.\")\n\n        return gr.Dropdown.update()\n\n    def update_pipeline_alpha(\n        self,\n        stable_diffusion_dropdown,\n        motion_module_dropdown,\n        base_model_dropdown=\"\",\n        lora_model_dropdown=\"none\",\n        lora_alpha_dropdown=\"0.6\",\n        sampler_dropdown=\"DDIM\",\n    ):\n        if lora_model_dropdown == \"none\":\n            return gr.Slider.update()\n\n        self.update_pipeline(\n            stable_diffusion_dropdown=stable_diffusion_dropdown,\n            motion_module_dropdown=motion_module_dropdown,\n            base_model_dropdown=base_model_dropdown,\n            lora_model_dropdown=lora_model_dropdown,\n            lora_alpha_dropdown=lora_alpha_dropdown,\n            sampler_dropdown=sampler_dropdown,\n        )\n\n        return gr.Slider.update()\n\n\n    @torch.no_grad()\n    def animate(\n        self,\n        prompt_textbox,\n        negative_prompt_textbox,\n        sampler_dropdown,\n        sample_step_slider,\n        width_slider,\n        length_slider,\n        height_slider,\n        cfg_scale_slider,\n        seed_textbox,\n    ):\n        if int(seed_textbox) != -1:\n            torch.manual_seed(int(seed_textbox))\n        else:\n            torch.seed()\n        seed = torch.initial_seed()\n        \n        sample = self.pipeline(\n            prompt_textbox,\n            negative_prompt = negative_prompt_textbox,\n            
num_inference_steps = sample_step_slider,\n            guidance_scale = cfg_scale_slider,\n            width = width_slider,\n            height = height_slider,\n            video_length = length_slider,\n        ).videos\n\n        save_sample_path = os.path.join(self.savedir_sample, f\"{sample_idx}.mp4\")\n        save_videos_grid(sample, save_sample_path)\n    \n        sample_config = {\n            \"prompt\": prompt_textbox,\n            \"n_prompt\": negative_prompt_textbox,\n            \"sampler\": sampler_dropdown,\n            \"num_inference_steps\": sample_step_slider,\n            \"guidance_scale\": cfg_scale_slider,\n            \"width\": width_slider,\n            \"height\": height_slider,\n            \"video_length\": length_slider,\n            \"seed\": seed\n        }\n\n        json_str = json.dumps(sample_config, indent=4)\n        with open(os.path.join(self.savedir, \"logs.json\"), \"a\") as f:\n            f.write(json_str)\n            f.write(\"\\n\\n\")\n            \n        return gr.Video.update(value=save_sample_path)\n        \n\ncontroller = AnimateController()\n\n\ndef ui():\n    with gr.Blocks(css=css) as demo:\n        gr.Markdown(\n            \"\"\"\n            # AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning\n            Yuwei Guo, Ceyuan Yang✝, Anyi Rao, Zhengyang Liang, Yaohui Wang, Yu Qiao, Maneesh Agrawala, Dahua Lin, Bo Dai (✝Corresponding Author)<br>\n            [Paper](https://arxiv.org/abs/2307.04725) | [Webpage](https://animatediff.github.io/) | [Github](https://github.com/guoyww/animatediff/)\n            \"\"\"\n        )\n        with gr.Column(variant=\"panel\"):\n            gr.Markdown(\n                \"\"\"\n                ### 1. 
Model Checkpoints\n                \"\"\"\n            )\n            with gr.Row():\n                stable_diffusion_dropdown = gr.Dropdown(\n                    label=\"Pretrained Model Path\",\n                    choices=controller.stable_diffusion_list,\n                    value=PRETRAINED_SD,\n                    interactive=True,\n                )\n                \n            with gr.Row():\n                motion_module_dropdown = gr.Dropdown(\n                    label=\"Select motion module\",\n                    choices=controller.motion_module_list,\n                    value=default_motion_module,\n                    interactive=True,\n                )\n                                \n                base_model_dropdown = gr.Dropdown(\n                    label=\"Select base Dreambooth model (required)\",\n                    choices=controller.personalized_model_list,\n                    value=default_dreambooth_model,\n                    interactive=True,\n                )\n                \n                lora_model_dropdown = gr.Dropdown(\n                    label=\"Select LoRA model (optional)\",\n                    choices=[\"none\"] + controller.personalized_model_list,\n                    value=\"none\",\n                    interactive=True,\n                )\n                \n                lora_alpha_dropdown = gr.Dropdown(\n                    label=\"LoRA alpha\", \n                    choices=[\"0.\", \"0.2\", \"0.4\", \"0.6\", \"0.8\", \"1.0\"],\n                    value=\"0.6\",\n                    interactive=True,\n                )\n                \n                personalized_refresh_button = gr.Button(value=\"\\U0001F503\", elem_classes=\"toolbutton\")\n                def update_personalized_model():\n                    controller.refresh_stable_diffusion()\n                    controller.refresh_personalized_model()\n                    return [\n                        
gr.Dropdown.update(choices=controller.stable_diffusion_list),\n                        gr.Dropdown.update(choices=controller.personalized_model_list),\n                        gr.Dropdown.update(choices=[\"none\"] + controller.personalized_model_list)\n                    ]\n                personalized_refresh_button.click(fn=update_personalized_model, inputs=[], outputs=[stable_diffusion_dropdown, base_model_dropdown, lora_model_dropdown])\n\n        with gr.Column(variant=\"panel\"):\n            gr.Markdown(\n                \"\"\"\n                ### 2. Configs for AnimateDiff.\n                \"\"\"\n            )\n            prompt_textbox = gr.Textbox(label=\"Prompt\", lines=2, value=default_prompt)\n            negative_prompt_textbox = gr.Textbox(label=\"Negative prompt\", lines=2, value=default_n_prompt)\n\n            with gr.Row().style(equal_height=False):\n                with gr.Column():\n                    with gr.Row():\n                        sampler_dropdown = gr.Dropdown(label=\"Sampling method\", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0])\n                        sample_step_slider = gr.Slider(label=\"Sampling steps\", value=25, minimum=10, maximum=100, step=1)\n                        \n                    width_slider = gr.Slider(label=\"Width\", value=512, minimum=256, maximum=1024, step=64)\n                    height_slider = gr.Slider(label=\"Height\", value=512, minimum=256, maximum=1024, step=64)\n                    length_slider = gr.Slider(label=\"Animation length (default: 16)\", value=16, minimum=8, maximum=24, step=1)\n                    cfg_scale_slider = gr.Slider(label=\"CFG Scale\", value=8.0, minimum=0, maximum=20)\n                    \n                    with gr.Row():\n                        seed_textbox = gr.Textbox(label=\"Seed (-1 for random seed)\", value=default_seed)\n                        seed_button = gr.Button(value=\"\\U0001F3B2\", elem_classes=\"toolbutton\")\n         
               seed_button.click(fn=lambda: gr.Textbox.update(value=random.randint(1, 1e8)), inputs=[], outputs=[seed_textbox])\n            \n                    generate_button = gr.Button(value=\"Generate\", variant='primary')\n                    \n                result_video = gr.Video(label=\"Generated Animation\", interactive=False)\n\n            # update method\n            stable_diffusion_dropdown.change(fn=controller.update_pipeline, inputs=[stable_diffusion_dropdown, motion_module_dropdown, base_model_dropdown, lora_model_dropdown, lora_alpha_dropdown, sampler_dropdown], outputs=[stable_diffusion_dropdown])\n            motion_module_dropdown.change(fn=controller.update_pipeline,    inputs=[stable_diffusion_dropdown, motion_module_dropdown, base_model_dropdown, lora_model_dropdown, lora_alpha_dropdown, sampler_dropdown], outputs=[motion_module_dropdown])\n            base_model_dropdown.change(fn=controller.update_pipeline,       inputs=[stable_diffusion_dropdown, motion_module_dropdown, base_model_dropdown, lora_model_dropdown, lora_alpha_dropdown, sampler_dropdown], outputs=[base_model_dropdown])\n            lora_model_dropdown.change(fn=controller.update_pipeline,       inputs=[stable_diffusion_dropdown, motion_module_dropdown, base_model_dropdown, lora_model_dropdown, lora_alpha_dropdown, sampler_dropdown], outputs=[lora_model_dropdown])\n            lora_alpha_dropdown.change(fn=controller.update_pipeline_alpha, inputs=[stable_diffusion_dropdown, motion_module_dropdown, base_model_dropdown, lora_model_dropdown, lora_alpha_dropdown, sampler_dropdown], outputs=[lora_alpha_dropdown])\n\n            generate_button.click(\n                fn=controller.animate,\n                inputs=[\n                    prompt_textbox, \n                    negative_prompt_textbox, \n                    sampler_dropdown, \n                    sample_step_slider, \n                    width_slider, \n                    length_slider, \n                    
height_slider, \n                    cfg_scale_slider, \n                    seed_textbox,\n                ],\n                outputs=[result_video]\n            )\n            \n    return demo\n\n\nif __name__ == \"__main__\":\n    demo = ui()\n    demo.launch(share=True)\n"
  },
  {
    "path": "configs/inference/inference-v1.yaml",
    "content": "unet_additional_kwargs:\n  use_inflated_groupnorm:     false\n  use_motion_module:          true\n  motion_module_resolutions:  [1,2,4,8]\n  motion_module_mid_block:    false\n  motion_module_type:         \"Vanilla\"\n  \n  motion_module_kwargs:\n    num_attention_heads:        8\n    num_transformer_block:      1\n    attention_block_types:      [ \"Temporal_Self\", \"Temporal_Self\" ]\n    temporal_position_encoding: true\n    temporal_attention_dim_div: 1\n    zero_initialize:            true\n\nnoise_scheduler_kwargs:\n  beta_start:    0.00085\n  beta_end:      0.012\n  beta_schedule: \"linear\"\n  steps_offset:  1\n  clip_sample:   false\n"
  },
  {
    "path": "configs/inference/inference-v2.yaml",
    "content": "unet_additional_kwargs:\n  use_inflated_groupnorm:     true\n  use_motion_module:          true\n  motion_module_resolutions:  [1,2,4,8]\n  motion_module_mid_block:    true\n  motion_module_type:         \"Vanilla\"\n\n  motion_module_kwargs:\n    num_attention_heads:        8\n    num_transformer_block:      1\n    attention_block_types:      [ \"Temporal_Self\", \"Temporal_Self\" ]\n    temporal_position_encoding: true\n    temporal_attention_dim_div: 1\n    zero_initialize:            true\n\nnoise_scheduler_kwargs:\n  beta_start:    0.00085\n  beta_end:      0.012\n  beta_schedule: \"linear\"\n  steps_offset:  1\n  clip_sample:   false\n"
  },
  {
    "path": "configs/inference/inference-v3.yaml",
    "content": "unet_additional_kwargs:\n  use_inflated_groupnorm:     true\n  use_motion_module:          true\n  motion_module_resolutions:  [1,2,4,8]\n  motion_module_mid_block:    false\n  motion_module_type:         \"Vanilla\"\n\n  motion_module_kwargs:\n    num_attention_heads:        8\n    num_transformer_block:      1\n    attention_block_types:      [ \"Temporal_Self\", \"Temporal_Self\" ]\n    temporal_position_encoding: true\n    temporal_attention_dim_div: 1\n    zero_initialize:            true\n\nnoise_scheduler_kwargs:\n  beta_start:    0.00085\n  beta_end:      0.012\n  beta_schedule: \"linear\"\n  steps_offset:  1\n  clip_sample:   false\n"
  },
  {
    "path": "configs/inference/sparsectrl/image_condition.yaml",
    "content": "controlnet_additional_kwargs:\n  set_noisy_sample_input_to_zero:     true\n  use_simplified_condition_embedding: false\n  conditioning_channels:              3\n\n  use_motion_module:         true\n  motion_module_resolutions: [1,2,4,8]\n  motion_module_mid_block:   false\n  motion_module_type:        \"Vanilla\"\n\n  motion_module_kwargs:\n    num_attention_heads:                8\n    num_transformer_block:              1\n    attention_block_types:              [ \"Temporal_Self\" ]\n    temporal_position_encoding:         true\n    temporal_position_encoding_max_len: 32\n    temporal_attention_dim_div:         1\n"
  },
  {
    "path": "configs/inference/sparsectrl/latent_condition.yaml",
    "content": "controlnet_additional_kwargs:\n  set_noisy_sample_input_to_zero:     true\n  use_simplified_condition_embedding: true\n  conditioning_channels:              4\n\n  use_motion_module:         true\n  motion_module_resolutions: [1,2,4,8]\n  motion_module_mid_block:   false\n  motion_module_type:        \"Vanilla\"\n\n  motion_module_kwargs:\n    num_attention_heads:                8\n    num_transformer_block:              1\n    attention_block_types:              [ \"Temporal_Self\" ]\n    temporal_position_encoding:         true\n    temporal_position_encoding_max_len: 32\n    temporal_attention_dim_div:         1\n"
  },
  {
    "path": "configs/prompts/1_animate/1_1_animate_RealisticVision.yaml",
    "content": "# motion module v3\n- dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module: \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  seed: [8893659352891878017, 9317678091797131699, 43242532350557906, 4162228652802886667]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"b&w photo of 42 y.o man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n    - \"close up photo of a rabbit, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot\"\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n    - \"night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain\"\n\n  n_prompt:\n    - \"semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck\"\n    - \"semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing 
arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck\"\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n\n\n# motion module v2\n- dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module: \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  seed: [8964153601421814582, 10589116295929063558, 13214918285578813247, 3460258020075528001]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"b&w photo of 42 y.o man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n    - \"close up photo of a rabbit, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot\"\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n    - \"night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain\"\n\n  n_prompt:\n    - \"semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, 
duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck\"\n    - \"semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck\"\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n"
  },
  {
    "path": "configs/prompts/1_animate/1_2_animate_FilmVelvia.yaml",
    "content": "# motion module v1_14\n- dreambooth_path: \"models/DreamBooth_LoRA/majicmixRealistic_v4.safetensors\"\n  lora_model_path: \"models/DreamBooth_LoRA/leosamsFilmgirlUltra_velvia20Lora.safetensors\"\n  lora_alpha: 0.6\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module: \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  seed: [5726977427157971918, 18368660165286593270, 9350384325017735240, 2097615141377450078]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"a woman standing on the side of a road at night,girl, long hair, motor vehicle, car, looking at viewer, ground vehicle, night, hands in pockets, blurry background, coat, black hair, parted lips, bokeh, jacket, brown hair, outdoors, red lips, upper body, artist name\"\n    - \"dark shot,0mm, portrait quality of a arab man worker,boy, wasteland that stands out vividly against the background of the desert, barren landscape, closeup, moles skin, soft light, sharp, exposure blend, medium shot, bokeh, hdr, high contrast, cinematic, teal and orange5, muted colors, dim colors, soothing tones, low saturation, hyperdetailed, noir\"\n    - \"fashion photography portrait of 1girl, offshoulder, fluffy short hair, soft light, rim light, beautiful shadow, low key, photorealistic, raw photo, natural skin texture, realistic eye and face details, hyperrealism, ultra high res, 4K, Best quality, masterpiece, necklace, cleavage, in the dark\"\n    - \"In this lighthearted portrait, a woman is dressed as a fierce warrior, armed with an arsenal of paintbrushes and palette knives. Her war paint is composed of thick, vibrant strokes of color, and her armor is made of paint tubes and paint-splattered canvases. She stands victoriously atop a mountain of conquered blank canvases, with a beautiful, colorful landscape behind her, symbolizing the power of art and creativity. 
bust Portrait, close-up, Bright and transparent scene lighting, \"\n\n  n_prompt:\n    - \"cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg\"\n    - \"cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg\"\n    - \"wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg\"\n    - \"wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg\"\n\n\n# motion module v1_15\n- dreambooth_path: \"models/DreamBooth_LoRA/majicmixRealistic_v4.safetensors\"\n  lora_model_path: \"models/DreamBooth_LoRA/leosamsFilmgirlUltra_velvia20Lora.safetensors\"\n  lora_alpha: 0.6\n\n  inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module: \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  seed: [2802659149552239028, 12507673598434739425, 1350017671114249824, 2813556755112853775]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"a woman standing on the side of a road 
at night,girl, long hair, motor vehicle, car, looking at viewer, ground vehicle, night, hands in pockets, blurry background, coat, black hair, parted lips, bokeh, jacket, brown hair, outdoors, red lips, upper body, artist name\"\n    - \", dark shot,0mm, portrait quality of a arab man worker,boy, wasteland that stands out vividly against the background of the desert, barren landscape, closeup, moles skin, soft light, sharp, exposure blend, medium shot, bokeh, hdr, high contrast, cinematic, teal and orange5, muted colors, dim colors, soothing tones, low saturation, hyperdetailed, noir\"\n    - \"fashion photography portrait of 1girl, offshoulder, fluffy short hair, soft light, rim light, beautiful shadow, low key, photorealistic, raw photo, natural skin texture, realistic eye and face details, hyperrealism, ultra high res, 4K, Best quality, masterpiece, necklace, cleavage, in the dark\"\n    - \"In this lighthearted portrait, a woman is dressed as a fierce warrior, armed with an arsenal of paintbrushes and palette knives. Her war paint is composed of thick, vibrant strokes of color, and her armor is made of paint tubes and paint-splattered canvases. She stands victoriously atop a mountain of conquered blank canvases, with a beautiful, colorful landscape behind her, symbolizing the power of art and creativity. 
bust Portrait, close-up, Bright and transparent scene lighting, \"\n\n  n_prompt:\n    - \"cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg\"\n    - \"cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg\"\n    - \"wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg\"\n    - \"wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg\"\n"
  },
  {
    "path": "configs/prompts/1_animate/1_3_animate_ToonYou.yaml",
    "content": "# motion module v3\n- dreambooth_path: \"models/DreamBooth_LoRA/toonyou_beta3.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module: \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  seed: [12192490710448890259, 12238800062118732365, 13226337751639812613, 16431231374396590344]\n  steps: 25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress\"\n    - \"masterpiece, best quality, 1girl, solo, cherry blossoms, hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling petals, white hair, black eyes,\"\n    - \"best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern\"\n    - \"best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle,\"\n\n  n_prompt:\n    - \"worst quality, low quality, letterboxed\"\n\n\n# motion module v2\n- dreambooth_path: \"models/DreamBooth_LoRA/toonyou_beta3.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module: \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  seed: [2362336635702964940, 8149279371559927917, 1487371078234460867, 17554906328875363976]\n  steps: 25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress\"\n    - \"masterpiece, best quality, 1girl, solo, cherry blossoms, hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling petals, white hair, black eyes,\"\n    - \"best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern\"\n    - \"best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle,\"\n\n  n_prompt:\n    - \"worst quality, low quality, letterboxed\"\n"
  },
  {
    "path": "configs/prompts/1_animate/1_4_animate_MajicMix.yaml",
    "content": "# motion module v1_14\n- dreambooth_path: \"models/DreamBooth_LoRA/majicmixRealistic_v5Preview.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module: \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  seed: [11413213594134208212, 11357183503136546592, 7315638361411279346, 10191753182015596097]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"1girl, offshoulder, light smile, shiny skin best quality, masterpiece, photorealistic\"\n    - \"best quality, masterpiece, photorealistic, 1boy, 50 years old beard, dramatic lighting\"\n    - \"best quality, masterpiece, photorealistic, 1girl, light smile, shirt with collars, waist up, dramatic lighting, from below\"\n    - \"male, man, beard, bodybuilder, skinhead,cold face, tough guy, cowboyshot, tattoo, french windows, luxury hotel masterpiece, best quality, photorealistic\"\n\n  n_prompt:\n    - \"ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, watermark, moles\"\n    - \"nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome\"\n    - \"nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome\"\n    - \"nude, nsfw, ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, monochrome, grayscale watermark, moles, people\"\n\n\n# motion module v1_15\n- dreambooth_path: \"models/DreamBooth_LoRA/majicmixRealistic_v5Preview.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module: \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  seed: [3364626746360550707, 10635741750919791646, 3130334860012077860, 1530101570151479035]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"1girl, offshoulder, light smile, shiny skin best quality, masterpiece, photorealistic\"\n 
   - \"best quality, masterpiece, photorealistic, 1boy, 50 years old beard, dramatic lighting\"\n    - \"best quality, masterpiece, photorealistic, 1girl, light smile, shirt with collars, waist up, dramatic lighting, from below\"\n    - \"male, man, beard, bodybuilder, skinhead,cold face, tough guy, cowboyshot, tattoo, french windows, luxury hotel masterpiece, best quality, photorealistic\"\n\n  n_prompt:\n    - \"ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, watermark, moles\"\n    - \"nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome\"\n    - \"nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome\"\n    - \"nude, nsfw, ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, monochrome, grayscale watermark, moles, people\"\n"
  },
  {
    "path": "configs/prompts/1_animate/1_5_animate_RcnzCartoon.yaml",
    "content": "# motion module v1_14\n- dreambooth_path: \"models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module: \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  seed: [8085079222822100088, 15493278891844617620, 17384760730172253253, 3896292336733512420]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded\"\n    - \"close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face\"\n    - \"absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes\"\n    - \"a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering\"\n\n  n_prompt:\n    - \"deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation\"\n    - \"nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular\"\n    - \"easynegative, 
cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face,\"\n    - \"beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand\"\n\n\n# motion module v1_15\n- dreambooth_path: \"models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module: \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  seed: [10087939632512181573, 6440765888009826001, 4292543217695451092, 14003068315619866795]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded\"\n    - \"close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face\"\n    - \"absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes\"\n    - \"a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering\"\n\n  n_prompt:\n    - 
\"deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation\"\n    - \"nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular\"\n    - \"easynegative, cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face,\"\n    - \"beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand\"\n"
  },
  {
    "path": "configs/prompts/1_animate/1_6_animate_Lyriel.yaml",
    "content": "# motion module v1_14\n- dreambooth_path: \"models/DreamBooth_LoRA/lyriel_v16.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module: \"models/Motion_Module/v3_sd15_mm.ckpt\"\n  \n  seed: [10917152860782582783, 6399018107401806238, 15875751942533906793, 6653196880059936551]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta color scheme, grunge aesthetic!!! graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange\"\n    - \"A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal\"\n    - \"dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray\"\n    - \"As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. 
Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown.\"\n\n  n_prompt:\n    - \"3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration\"\n    - \"3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular\"\n    - \"dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome\"\n    - \"holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render\"\n\n\n# motion module v1_15\n- dreambooth_path: \"models/DreamBooth_LoRA/lyriel_v16.safetensors\"\n  lora_model_path: \"\"\n\n  inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module: 
\"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n  \n  seed: [9217823730598265840, 10815047877257294769, 15033600051075248739, 3730216622332453211]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta color scheme, grunge aesthetic!!! graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange\"\n    - \"A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal\"\n    - \"dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray\"\n    - \"As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. 
Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown.\"\n\n  n_prompt:\n    - \"3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration\"\n    - \"3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular\"\n    - \"dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome\"\n    - \"holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render\"\n"
  },
  {
    "path": "configs/prompts/1_animate/1_7_animate_Tusun.yaml",
    "content": "# motion module v1_14\n- dreambooth_path: \"models/DreamBooth_LoRA/leosamsHelloworldXL_filmGrain20.safetensors\"\n  lora_model_path: \"models/DreamBooth_LoRA/TUSUN.safetensors\"\n  lora_alpha: 0.6\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module: \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  seed: [7107114461349773341, 17169636352587613974, 9844335976427375435, 6372518434592560610]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"tusuncub with its mouth open, blurry, open mouth, fangs, photo background, looking at viewer, tongue, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing\"\n    - \"cute tusun with a blurry background, black background, simple background, signature, face, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing\"\n    - \"cut tusuncub walking in the snow, blurry, looking at viewer, depth of field, blurry background, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing\"\n    - \"character design, cyberpunk tusun kitten wearing astronaut suit, sci-fic, realistic eye color and details, fluffy, big head, science fiction, communist ideology, Cyborg, fantasy, intense angle, soft 
lighting, photograph, 4k, hyper detailed, portrait wallpaper, realistic, photo-realistic, DSLR, 24 Megapixels, Full Frame, vibrant details, octane render, finely detail, best quality, incredibly absurdres, robotic parts, rim light, vibrant details, luxurious cyberpunk, hyperrealistic, cable electric wires, microchip, full body\"\n\n  n_prompt:\n    - \"worst quality, low quality, deformed, distorted, disfigured, bad eyes, bad anatomy, disconnected limbs, wrong body proportions, low quality, worst quality, text, watermark, signatre, logo, illustration, painting, cartoons, ugly, easy_negative\"\n\n\n# motion module v1_15\n- dreambooth_path: \"models/DreamBooth_LoRA/leosamsHelloworldXL_filmGrain20.safetensors\"\n  lora_model_path: \"models/DreamBooth_LoRA/TUSUN.safetensors\"\n  lora_alpha: 0.6\n\n  inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module: \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  seed: [8605999221232672724, 110148213803975296, 9191327304973552413, 174075196208604916]\n  steps: 25\n  guidance_scale: 8\n\n  prompt:\n    - \"tusuncub with its mouth open, blurry, open mouth, fangs, photo background, looking at viewer, tongue, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing\"\n    - \"cute tusun with a blurry background, black background, simple background, signature, face, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing\"\n    - \"cut tusuncub walking in the snow, blurry, looking at 
viewer, depth of field, blurry background, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing\"\n    - \"character design, cyberpunk tusun kitten wearing astronaut suit, sci-fic, realistic eye color and details, fluffy, big head, science fiction, communist ideology, Cyborg, fantasy, intense angle, soft lighting, photograph, 4k, hyper detailed, portrait wallpaper, realistic, photo-realistic, DSLR, 24 Megapixels, Full Frame, vibrant details, octane render, finely detail, best quality, incredibly absurdres, robotic parts, rim light, vibrant details, luxurious cyberpunk, hyperrealistic, cable electric wires, microchip, full body\"\n\n  n_prompt:\n    - \"worst quality, low quality, deformed, distorted, disfigured, bad eyes, bad anatomy, disconnected limbs, wrong body proportions, low quality, worst quality, text, watermark, signatre, logo, illustration, painting, cartoons, ugly, easy_negative\"\n"
  },
  {
    "path": "configs/prompts/2_motionlora/2_motionlora_RealisticVision.yaml",
    "content": "# ZoomIn\n- inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module:    \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  motion_module_lora_configs:\n    - path:  \"models/MotionLoRA/v2_lora_ZoomIn.ckpt\"\n      alpha: 1.0\n\n  dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  seed:           43242532350557906\n  steps:          25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n\n  n_prompt:\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n\n\n# ZoomOut\n- inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module:    \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  motion_module_lora_configs:\n    - path:  \"models/MotionLoRA/v2_lora_ZoomOut.ckpt\"\n      alpha: 1.0\n\n  dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  seed:           43242532350557906\n  steps:          25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n\n  n_prompt:\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n\n\n# PanLeft\n- inference_config: \"configs/inference/inference-v2.yaml\"\n  
motion_module:    \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  motion_module_lora_configs:\n    - path:  \"models/MotionLoRA/v2_lora_PanLeft.ckpt\"\n      alpha: 1.0\n\n  dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  seed:           43242532350557906\n  steps:          25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n\n  n_prompt:\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n\n\n# PanRight\n- inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module:    \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  motion_module_lora_configs:\n    - path:  \"models/MotionLoRA/v2_lora_PanRight.ckpt\"\n      alpha: 1.0\n\n  dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  seed:           43242532350557906\n  steps:          25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n\n  n_prompt:\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n\n\n# TiltUp\n- inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module:    \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  
motion_module_lora_configs:\n    - path:  \"models/MotionLoRA/v2_lora_TiltUp.ckpt\"\n      alpha: 1.0\n\n  dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  seed:           43242532350557906\n  steps:          25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n\n  n_prompt:\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n\n\n# TiltDown\n- inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module:    \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  motion_module_lora_configs:\n    - path:  \"models/MotionLoRA/v2_lora_TiltDown.ckpt\"\n      alpha: 1.0\n\n  dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  seed:           43242532350557906\n  steps:          25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n\n  n_prompt:\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n\n\n# RollingAnticlockwise\n- inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module:    \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  motion_module_lora_configs:\n    - path:  
\"models/MotionLoRA/v2_lora_RollingAnticlockwise.ckpt\"\n      alpha: 1.0\n\n  dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  seed:           43242532350557906\n  steps:          25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n\n  n_prompt:\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n\n\n# RollingClockwise\n- inference_config: \"configs/inference/inference-v2.yaml\"\n  motion_module:    \"models/Motion_Module/mm_sd_v15_v2.ckpt\"\n\n  motion_module_lora_configs:\n    - path:  \"models/MotionLoRA/v2_lora_RollingClockwise.ckpt\"\n      alpha: 1.0\n\n  dreambooth_path: \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n  lora_model_path: \"\"\n\n  seed:           43242532350557906\n  steps:          25\n  guidance_scale: 7.5\n\n  prompt:\n    - \"photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3\"\n\n  n_prompt:\n    - \"blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation\"\n"
  },
  {
    "path": "configs/prompts/3_sparsectrl/3_1_sparsectrl_i2v.yaml",
    "content": "# 1-animation\n- adapter_lora_scale: 1.0\n  adapter_lora_path: \"models/Motion_Module/v3_sd15_adapter.ckpt\"\n  dreambooth_path:   \"\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module:    \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  controlnet_config: \"configs/inference/sparsectrl/latent_condition.yaml\"\n  controlnet_path:   \"models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt\"\n\n  H: 256\n  W: 384\n  seed: [123,234]\n  steps: 25\n  guidance_scale: 8.5\n\n  controlnet_image_indexs: [0]\n  controlnet_images:\n    - \"__assets__/demos/image/painting.png\"\n\n  prompt:\n    - an oil painting of a sailboat in the ocean wave\n    - an oil painting of a sailboat in the ocean wave\n  n_prompt:\n    - \"worst quality, low quality, letterboxed\"\n\n\n# 2-interpolation\n- adapter_lora_scale: 1.0\n  adapter_lora_path: \"models/Motion_Module/v3_sd15_adapter.ckpt\"\n  dreambooth_path:   \"\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module:    \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  controlnet_config: \"configs/inference/sparsectrl/latent_condition.yaml\"\n  controlnet_path:   \"models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt\"\n\n  H: 256\n  W: 384\n  seed: [123,234]\n  steps: 25\n  guidance_scale: 8.5\n\n  controlnet_image_indexs: [0,-1]\n  controlnet_images:\n    - \"__assets__/demos/image/interpolation_1.png\"\n    - \"__assets__/demos/image/interpolation_2.png\"\n\n  prompt:\n    - \"aerial view, beautiful forest, autumn, 4k, high quality\"\n    - \"aerial view, beautiful forest, autumn, 4k, high quality\"\n  n_prompt:\n    - \"worst quality, low quality, letterboxed\"\n\n\n# 3-interpolation\n- adapter_lora_scale: 1.0\n  adapter_lora_path: \"models/Motion_Module/v3_sd15_adapter.ckpt\"\n  dreambooth_path:   \"\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module:    \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  controlnet_config: 
\"configs/inference/sparsectrl/latent_condition.yaml\"\n  controlnet_path:   \"models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt\"\n\n  H: 256\n  W: 384\n  seed:           [123,234]\n  steps:          25\n  guidance_scale: 8.5\n\n  controlnet_image_indexs: [0,5,10,15]\n  controlnet_images:\n    - \"__assets__/demos/image/low_fps_1.png\"\n    - \"__assets__/demos/image/low_fps_2.png\"\n    - \"__assets__/demos/image/low_fps_3.png\"\n    - \"__assets__/demos/image/low_fps_4.png\"\n\n  prompt:\n    - \"two people holding hands in a field with wind turbines in the background\"\n    - \"two people holding hands in a field with wind turbines in the background\"\n  n_prompt:\n    - \"worst quality, low quality, letterboxed\"\n\n\n# 3-prediction\n- adapter_lora_scale: 1.0\n  adapter_lora_path: \"models/Motion_Module/v3_sd15_adapter.ckpt\"\n  dreambooth_path:   \"\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module:    \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  controlnet_config: \"configs/inference/sparsectrl/latent_condition.yaml\"\n  controlnet_path:   \"models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt\"\n\n  H: 256\n  W: 384\n  seed:           [123,234]\n  steps:          25\n  guidance_scale: 8.5\n\n  controlnet_image_indexs: [0,1,2,3]\n  controlnet_images:\n    - \"__assets__/demos/image/prediction_1.png\"\n    - \"__assets__/demos/image/prediction_2.png\"\n    - \"__assets__/demos/image/prediction_3.png\"\n    - \"__assets__/demos/image/prediction_4.png\"\n\n  prompt:\n    - \"an astronaut is flying in the space, 4k, high resolution\"\n    - \"an astronaut is flying in the space, 4k, high resolution\"\n  n_prompt:\n    - \"worst quality, low quality, letterboxed\"\n"
  },
  {
    "path": "configs/prompts/3_sparsectrl/3_2_sparsectrl_rgb_RealisticVision.yaml",
    "content": "# animation-1\n- adapter_lora_scale: 1.0\n  adapter_lora_path: \"models/Motion_Module/v3_sd15_adapter.ckpt\"\n  dreambooth_path:   \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module:    \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  controlnet_config: \"configs/inference/sparsectrl/latent_condition.yaml\"\n  controlnet_path:   \"models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt\"\n\n  seed: -1\n  steps: 25\n  guidance_scale: 8.5\n\n  controlnet_image_indexs: [0]\n  controlnet_images:\n    - \"__assets__/demos/image/RealisticVision_firework.png\"\n\n  prompt:\n    - \"closeup face photo of man in black clothes, night city street, bokeh, fireworks in background\"\n    - \"closeup face photo of man in black clothes, night city street, bokeh, fireworks in background\"\n  n_prompt:\n    - \"worst quality, low quality, letterboxed\"\n\n\n# animation-2\n- adapter_lora_scale: 1.0\n  adapter_lora_path: \"models/Motion_Module/v3_sd15_adapter.ckpt\"\n  dreambooth_path:   \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module:    \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  controlnet_config: \"configs/inference/sparsectrl/latent_condition.yaml\"\n  controlnet_path:   \"models/SparseCtrl/v3_sd15_sparsectrl_rgb.ckpt\"\n\n  seed: -1\n  steps: 25\n  guidance_scale: 8.5\n\n  controlnet_image_indexs: [0]\n  controlnet_images:\n    - \"__assets__/demos/image/RealisticVision_sunset.png\"\n\n  prompt:\n    - \"masterpiece, bestquality, highlydetailed, ultradetailed, sunset, orange sky, warm lighting, fishing boats, ocean waves, seagulls, rippling water, wharf, silhouette, serene atmosphere, dusk, evening glow, golden hour, coastal landscape, seaside scenery\"\n    - \"masterpiece, bestquality, highlydetailed, ultradetailed, sunset, orange sky, warm lighting, fishing boats, ocean waves, 
seagulls, rippling water, wharf, silhouette, serene atmosphere, dusk, evening glow, golden hour, coastal landscape, seaside scenery\"\n  n_prompt:\n    - \"worst quality, low quality, letterboxed\"\n"
  },
  {
    "path": "configs/prompts/3_sparsectrl/3_3_sparsectrl_sketch_RealisticVision.yaml",
    "content": "# 1-sketch-to-video\n- adapter_lora_scale: 1.0\n  adapter_lora_path: \"models/Motion_Module/v3_sd15_adapter.ckpt\"\n  dreambooth_path:    \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module:    \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  controlnet_config: \"configs/inference/sparsectrl/image_condition.yaml\"\n  controlnet_path:   \"models/SparseCtrl/v3_sd15_sparsectrl_scribble.ckpt\"\n\n  seed: -1\n  steps: 25\n  guidance_scale: 8.5\n\n  controlnet_image_indexs: [0]\n  controlnet_images:\n    - \"__assets__/demos/scribble/scribble_1.png\"\n\n  prompt:\n    - \"a back view of a boy, standing on the ground, looking at the sky, sunlight, masterpieces\"\n    - \"a back view of a boy, standing on the ground, looking at the sky, clouds, sunset, orange sky, beautiful sunlight, masterpieces\"\n  n_prompt:\n    - \"worst quality, low quality, letterboxed\"\n\n\n# 2-storyboarding\n- adapter_lora_scale: 1.0\n  adapter_lora_path: \"models/Motion_Module/v3_sd15_adapter.ckpt\"\n  dreambooth_path:    \"models/DreamBooth_LoRA/realisticVisionV60B1_v51VAE.safetensors\"\n\n  inference_config: \"configs/inference/inference-v3.yaml\"\n  motion_module:    \"models/Motion_Module/v3_sd15_mm.ckpt\"\n\n  controlnet_config: \"configs/inference/sparsectrl/image_condition.yaml\"\n  controlnet_path:   \"models/SparseCtrl/v3_sd15_sparsectrl_scribble.ckpt\"\n\n  seed: -1\n  steps: 25\n  guidance_scale: 8.5\n\n  controlnet_image_indexs: [0,8,15]\n  controlnet_images:\n    - \"__assets__/demos/scribble/scribble_2_1.png\"\n    - \"__assets__/demos/scribble/scribble_2_2.png\"\n    - \"__assets__/demos/scribble/scribble_2_3.png\"\n\n  prompt:\n    - \"an aerial view of a modern city, sunlight, day time, masterpiece, high quality\"\n    - \"an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality\"\n  n_prompt:\n    - \"worst quality, low quality, 
letterboxed\"\n"
  },
  {
    "path": "configs/training/v1/image_finetune.yaml",
    "content": "image_finetune: true\n\noutput_dir: \"outputs\"\npretrained_model_path: \"models/StableDiffusion/stable-diffusion-v1-5\"\n\nnoise_scheduler_kwargs:\n  num_train_timesteps: 1000\n  beta_start:          0.00085\n  beta_end:            0.012\n  beta_schedule:       \"scaled_linear\"\n  steps_offset:        1\n  clip_sample:         false\n\ntrain_data:\n  csv_path:     \"path_to_csv_file\"\n  video_folder: \"path_to_video_foler\"\n  sample_size:  256\n\nvalidation_data:\n  prompts:\n    - \"Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons.\"\n    - \"A drone view of celebration with Christma tree and fireworks, starry sky - background.\"\n    - \"Robot dancing in times square.\"\n    - \"Pacific coast, carmel by the sea ocean and waves.\"\n  num_inference_steps: 25\n  guidance_scale: 8.\n\ntrainable_modules:\n  - \".\"\n\nunet_checkpoint_path: \"\"\n\nlearning_rate:    1.e-5\ntrain_batch_size: 50\n\nmax_train_epoch:      -1\nmax_train_steps:      100\ncheckpointing_epochs: -1\ncheckpointing_steps:  60\n\nvalidation_steps:       5000\nvalidation_steps_tuple: [2, 50]\n\nglobal_seed: 42\nmixed_precision_training: true\nenable_xformers_memory_efficient_attention: True\n\nis_debug: False\n"
  },
  {
    "path": "configs/training/v1/training.yaml",
    "content": "image_finetune: false\n\noutput_dir: \"outputs\"\npretrained_model_path: \"models/StableDiffusion/stable-diffusion-v1-5\"\n\nunet_additional_kwargs:\n  use_motion_module              : true\n  motion_module_resolutions      : [ 1,2,4,8 ]\n  unet_use_cross_frame_attention : false\n  unet_use_temporal_attention    : false\n\n  motion_module_type: Vanilla\n  motion_module_kwargs:\n    num_attention_heads                : 8\n    num_transformer_block              : 1\n    attention_block_types              : [ \"Temporal_Self\", \"Temporal_Self\" ]\n    temporal_position_encoding         : true\n    temporal_position_encoding_max_len : 24\n    temporal_attention_dim_div         : 1\n    zero_initialize                    : true\n\nnoise_scheduler_kwargs:\n  num_train_timesteps: 1000\n  beta_start:          0.00085\n  beta_end:            0.012\n  beta_schedule:       \"linear\"\n  steps_offset:        1\n  clip_sample:         false\n\ntrain_data:\n  csv_path:        \"path_to_csv_file\"\n  video_folder:    \"path_to_video_foler\"\n  sample_size:     256\n  sample_stride:   4\n  sample_n_frames: 16\n\nvalidation_data:\n  prompts:\n    - \"Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons.\"\n    - \"A drone view of celebration with Christma tree and fireworks, starry sky - background.\"\n    - \"Robot dancing in times square.\"\n    - \"Pacific coast, carmel by the sea ocean and waves.\"\n  num_inference_steps: 25\n  guidance_scale: 8.\n\ntrainable_modules:\n  - \"motion_modules.\"\n\nunet_checkpoint_path: \"\"\n\nlearning_rate:    1.e-4\ntrain_batch_size: 4\n\nmax_train_epoch:      -1\nmax_train_steps:      100\ncheckpointing_epochs: -1\ncheckpointing_steps:  60\n\nvalidation_steps:       5000\nvalidation_steps_tuple: [2, 50]\n\nglobal_seed: 42\nmixed_precision_training: true\nenable_xformers_memory_efficient_attention: True\n\nis_debug: False\n"
  },
  {
    "path": "requirements.txt",
    "content": "torch==2.3.1\ntorchvision==0.18.1\ndiffusers==0.11.1\ntransformers==4.25.1\nxformers==0.0.27\nimageio==2.27.0\nimageio-ffmpeg==0.4.9\ndecord==0.6.0\nomegaconf==2.3.0\ngradio==3.36.1\nsafetensors\neinops\nwandb\n"
  },
  {
    "path": "scripts/animate.py",
    "content": "import argparse\nimport datetime\nimport inspect\nimport os\nfrom omegaconf import OmegaConf\n\nimport torch\nimport torchvision.transforms as transforms\n\nimport diffusers\nfrom diffusers import AutoencoderKL, DDIMScheduler\n\nfrom tqdm.auto import tqdm\nfrom transformers import CLIPTextModel, CLIPTokenizer\n\nfrom animatediff.models.unet import UNet3DConditionModel\nfrom animatediff.models.sparse_controlnet import SparseControlNetModel\nfrom animatediff.pipelines.pipeline_animation import AnimationPipeline\nfrom animatediff.utils.util import save_videos_grid\nfrom animatediff.utils.util import load_weights, auto_download\nfrom diffusers.utils.import_utils import is_xformers_available\n\nfrom einops import rearrange, repeat\n\nimport csv, pdb, glob, math\nfrom pathlib import Path\nfrom PIL import Image\nimport numpy as np\n\n\n@torch.no_grad()\ndef main(args):\n    *_, func_args = inspect.getargvalues(inspect.currentframe())\n    func_args = dict(func_args)\n    \n    time_str = datetime.datetime.now().strftime(\"%Y-%m-%dT%H-%M-%S\")\n    savedir = f\"samples/{Path(args.config).stem}-{time_str}\"\n    os.makedirs(savedir)\n\n    config  = OmegaConf.load(args.config)\n    samples = []\n\n    # create validation pipeline\n    tokenizer    = CLIPTokenizer.from_pretrained(args.pretrained_model_path, subfolder=\"tokenizer\")\n    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_path, subfolder=\"text_encoder\").cuda()\n    vae          = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder=\"vae\").cuda()\n\n    sample_idx = 0\n    for model_idx, model_config in enumerate(config):\n        model_config.W = model_config.get(\"W\", args.W)\n        model_config.H = model_config.get(\"H\", args.H)\n        model_config.L = model_config.get(\"L\", args.L)\n\n        inference_config = OmegaConf.load(model_config.get(\"inference_config\", args.inference_config))\n        unet = 
UNet3DConditionModel.from_pretrained_2d(args.pretrained_model_path, subfolder=\"unet\", unet_additional_kwargs=OmegaConf.to_container(inference_config.unet_additional_kwargs)).cuda()\n\n        # load controlnet model\n        controlnet = controlnet_images = None\n        if model_config.get(\"controlnet_path\", \"\") != \"\":\n            assert model_config.get(\"controlnet_images\", \"\") != \"\"\n            assert model_config.get(\"controlnet_config\", \"\") != \"\"\n            \n            unet.config.num_attention_heads = 8\n            unet.config.projection_class_embeddings_input_dim = None\n\n            controlnet_config = OmegaConf.load(model_config.controlnet_config)\n            controlnet = SparseControlNetModel.from_unet(unet, controlnet_additional_kwargs=controlnet_config.get(\"controlnet_additional_kwargs\", {}))\n\n            auto_download(model_config.controlnet_path, is_dreambooth_lora=False)\n            print(f\"loading controlnet checkpoint from {model_config.controlnet_path} ...\")\n            controlnet_state_dict = torch.load(model_config.controlnet_path, map_location=\"cpu\")\n            controlnet_state_dict = controlnet_state_dict[\"controlnet\"] if \"controlnet\" in controlnet_state_dict else controlnet_state_dict\n            controlnet_state_dict = {name: param for name, param in controlnet_state_dict.items() if \"pos_encoder.pe\" not in name}\n            controlnet_state_dict.pop(\"animatediff_config\", \"\")\n            controlnet.load_state_dict(controlnet_state_dict)\n            controlnet.cuda()\n\n            image_paths = model_config.controlnet_images\n            if isinstance(image_paths, str): image_paths = [image_paths]\n\n            print(f\"controlnet image paths:\")\n            for path in image_paths: print(path)\n            assert len(image_paths) <= model_config.L\n\n            image_transforms = transforms.Compose([\n                transforms.RandomResizedCrop(\n                    (model_config.H, 
model_config.W), (1.0, 1.0), \n                    ratio=(model_config.W/model_config.H, model_config.W/model_config.H)\n                ),\n                transforms.ToTensor(),\n            ])\n\n            if model_config.get(\"normalize_condition_images\", False):\n                def image_norm(image):\n                    image = image.mean(dim=0, keepdim=True).repeat(3,1,1)\n                    image -= image.min()\n                    image /= image.max()\n                    return image\n            else: image_norm = lambda x: x\n                \n            controlnet_images = [image_norm(image_transforms(Image.open(path).convert(\"RGB\"))) for path in image_paths]\n\n            os.makedirs(os.path.join(savedir, \"control_images\"), exist_ok=True)\n            for i, image in enumerate(controlnet_images):\n                Image.fromarray((255. * (image.numpy().transpose(1,2,0))).astype(np.uint8)).save(f\"{savedir}/control_images/{i}.png\")\n\n            controlnet_images = torch.stack(controlnet_images).unsqueeze(0).cuda()\n            controlnet_images = rearrange(controlnet_images, \"b f c h w -> b c f h w\")\n\n            if controlnet.use_simplified_condition_embedding:\n                num_controlnet_images = controlnet_images.shape[2]\n                controlnet_images = rearrange(controlnet_images, \"b c f h w -> (b f) c h w\")\n                controlnet_images = vae.encode(controlnet_images * 2. 
- 1.).latent_dist.sample() * 0.18215\n                controlnet_images = rearrange(controlnet_images, \"(b f) c h w -> b c f h w\", f=num_controlnet_images)\n\n        # set xformers\n        if is_xformers_available() and (not args.without_xformers):\n            unet.enable_xformers_memory_efficient_attention()\n            if controlnet is not None: controlnet.enable_xformers_memory_efficient_attention()\n\n        pipeline = AnimationPipeline(\n            vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet,\n            controlnet=controlnet,\n            scheduler=DDIMScheduler(**OmegaConf.to_container(inference_config.noise_scheduler_kwargs)),\n        ).to(\"cuda\")\n\n        pipeline = load_weights(\n            pipeline,\n            # motion module\n            motion_module_path         = model_config.get(\"motion_module\", \"\"),\n            motion_module_lora_configs = model_config.get(\"motion_module_lora_configs\", []),\n            # domain adapter\n            adapter_lora_path          = model_config.get(\"adapter_lora_path\", \"\"),\n            adapter_lora_scale         = model_config.get(\"adapter_lora_scale\", 1.0),\n            # image layers\n            dreambooth_model_path      = model_config.get(\"dreambooth_path\", \"\"),\n            lora_model_path            = model_config.get(\"lora_model_path\", \"\"),\n            lora_alpha                 = model_config.get(\"lora_alpha\", 0.8),\n        ).to(\"cuda\")\n\n        prompts      = model_config.prompt\n        n_prompts    = list(model_config.n_prompt) * len(prompts) if len(model_config.n_prompt) == 1 else model_config.n_prompt\n        \n        random_seeds = model_config.get(\"seed\", [-1])\n        random_seeds = [random_seeds] if isinstance(random_seeds, int) else list(random_seeds)\n        random_seeds = random_seeds * len(prompts) if len(random_seeds) == 1 else random_seeds\n        \n        config[model_idx].random_seed = []\n        for prompt_idx, 
(prompt, n_prompt, random_seed) in enumerate(zip(prompts, n_prompts, random_seeds)):\n            \n            # manually set random seed for reproduction\n            if random_seed != -1: torch.manual_seed(random_seed)\n            else: torch.seed()\n            config[model_idx].random_seed.append(torch.initial_seed())\n            \n            print(f\"current seed: {torch.initial_seed()}\")\n            print(f\"sampling {prompt} ...\")\n            sample = pipeline(\n                prompt,\n                negative_prompt     = n_prompt,\n                num_inference_steps = model_config.steps,\n                guidance_scale      = model_config.guidance_scale,\n                width               = model_config.W,\n                height              = model_config.H,\n                video_length        = model_config.L,\n\n                controlnet_images = controlnet_images,\n                controlnet_image_index = model_config.get(\"controlnet_image_indexs\", [0]),\n            ).videos\n            samples.append(sample)\n\n            prompt = \"-\".join((prompt.replace(\"/\", \"\").split(\" \")[:10]))\n            save_videos_grid(sample, f\"{savedir}/sample/{sample_idx}-{prompt}.gif\")\n            print(f\"save to {savedir}/sample/{prompt}.gif\")\n            \n            sample_idx += 1\n\n    samples = torch.concat(samples)\n    save_videos_grid(samples, f\"{savedir}/sample.gif\", n_rows=4)\n\n    OmegaConf.save(config, f\"{savedir}/config.yaml\")\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--pretrained-model-path\", type=str, default=\"runwayml/stable-diffusion-v1-5\")\n    parser.add_argument(\"--inference-config\",      type=str, default=\"configs/inference/inference-v1.yaml\")    \n    parser.add_argument(\"--config\",                type=str, required=True)\n    \n    parser.add_argument(\"--L\", type=int, default=16 )\n    parser.add_argument(\"--W\", type=int, default=512)\n  
  parser.add_argument(\"--H\", type=int, default=512)\n\n    parser.add_argument(\"--without-xformers\", action=\"store_true\")\n\n    args = parser.parse_args()\n    main(args)\n"
  },
  {
    "path": "train.py",
    "content": "import os\nimport math\nimport wandb\nimport random\nimport logging\nimport inspect\nimport argparse\nimport datetime\nimport subprocess\n\nfrom pathlib import Path\nfrom tqdm.auto import tqdm\nfrom einops import rearrange\nfrom omegaconf import OmegaConf\nfrom safetensors import safe_open\nfrom typing import Dict, Optional, Tuple\n\nimport torch\nimport torchvision\nimport torch.nn.functional as F\nimport torch.distributed as dist\nfrom torch.optim.swa_utils import AveragedModel\nfrom torch.utils.data.distributed import DistributedSampler\nfrom torch.nn.parallel import DistributedDataParallel as DDP\n\nimport diffusers\nfrom diffusers import AutoencoderKL, DDIMScheduler\nfrom diffusers.models import UNet2DConditionModel\nfrom diffusers.pipelines import StableDiffusionPipeline\nfrom diffusers.optimization import get_scheduler\nfrom diffusers.utils import check_min_version\nfrom diffusers.utils.import_utils import is_xformers_available\n\nimport transformers\nfrom transformers import CLIPTextModel, CLIPTokenizer\n\nfrom animatediff.data.dataset import WebVid10M\nfrom animatediff.models.unet import UNet3DConditionModel\nfrom animatediff.pipelines.pipeline_animation import AnimationPipeline\nfrom animatediff.utils.util import save_videos_grid, zero_rank_print\n\n\n\ndef init_dist(launcher=\"slurm\", backend='nccl', port=29500, **kwargs):\n    \"\"\"Initializes distributed environment.\"\"\"\n    if launcher == 'pytorch':\n        rank = int(os.environ['RANK'])\n        num_gpus = torch.cuda.device_count()\n        local_rank = rank % num_gpus\n        torch.cuda.set_device(local_rank)\n        dist.init_process_group(backend=backend, **kwargs)\n        \n    elif launcher == 'slurm':\n        proc_id = int(os.environ['SLURM_PROCID'])\n        ntasks = int(os.environ['SLURM_NTASKS'])\n        node_list = os.environ['SLURM_NODELIST']\n        num_gpus = torch.cuda.device_count()\n        local_rank = proc_id % num_gpus\n        
torch.cuda.set_device(local_rank)\n        addr = subprocess.getoutput(\n            f'scontrol show hostname {node_list} | head -n1')\n        os.environ['MASTER_ADDR'] = addr\n        os.environ['WORLD_SIZE'] = str(ntasks)\n        os.environ['RANK'] = str(proc_id)\n        port = os.environ.get('PORT', port)\n        os.environ['MASTER_PORT'] = str(port)\n        dist.init_process_group(backend=backend)\n        zero_rank_print(f\"proc_id: {proc_id}; local_rank: {local_rank}; ntasks: {ntasks}; node_list: {node_list}; num_gpus: {num_gpus}; addr: {addr}; port: {port}\")\n        \n    else:\n        raise NotImplementedError(f'Not implemented launcher type: `{launcher}`!')\n    \n    return local_rank\n\n\n\ndef main(\n    image_finetune: bool,\n    \n    name: str,\n    use_wandb: bool,\n    launcher: str,\n    \n    output_dir: str,\n    pretrained_model_path: str,\n\n    train_data: Dict,\n    validation_data: Dict,\n    cfg_random_null_text: bool = True,\n    cfg_random_null_text_ratio: float = 0.1,\n    \n    unet_checkpoint_path: str = \"\",\n    unet_additional_kwargs: Dict = {},\n    ema_decay: float = 0.9999,\n    noise_scheduler_kwargs = None,\n    \n    max_train_epoch: int = -1,\n    max_train_steps: int = 100,\n    validation_steps: int = 100,\n    validation_steps_tuple: Tuple = (-1,),\n\n    learning_rate: float = 3e-5,\n    scale_lr: bool = False,\n    lr_warmup_steps: int = 0,\n    lr_scheduler: str = \"constant\",\n\n    trainable_modules: Tuple[str] = (None, ),\n    num_workers: int = 32,\n    train_batch_size: int = 1,\n    adam_beta1: float = 0.9,\n    adam_beta2: float = 0.999,\n    adam_weight_decay: float = 1e-2,\n    adam_epsilon: float = 1e-08,\n    max_grad_norm: float = 1.0,\n    gradient_accumulation_steps: int = 1,\n    gradient_checkpointing: bool = False,\n    checkpointing_epochs: int = 5,\n    checkpointing_steps: int = -1,\n\n    mixed_precision_training: bool = True,\n    enable_xformers_memory_efficient_attention: bool = 
True,\n\n    global_seed: int = 42,\n    is_debug: bool = False,\n):\n    check_min_version(\"0.10.0.dev0\")\n\n    # Initialize distributed training\n    local_rank      = init_dist(launcher=launcher)\n    global_rank     = dist.get_rank()\n    num_processes   = dist.get_world_size()\n    is_main_process = global_rank == 0\n\n    seed = global_seed + global_rank\n    torch.manual_seed(seed)\n    \n    # Logging folder\n    folder_name = \"debug\" if is_debug else name + datetime.datetime.now().strftime(\"-%Y-%m-%dT%H-%M-%S\")\n    output_dir = os.path.join(output_dir, folder_name)\n    if is_debug and os.path.exists(output_dir):\n        os.system(f\"rm -rf {output_dir}\")\n\n    *_, config = inspect.getargvalues(inspect.currentframe())\n\n    # Make one log on every process with the configuration for debugging.\n    logging.basicConfig(\n        format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n        datefmt=\"%m/%d/%Y %H:%M:%S\",\n        level=logging.INFO,\n    )\n\n    if is_main_process and (not is_debug) and use_wandb:\n        run = wandb.init(project=\"animatediff\", name=folder_name, config=config)\n\n    # Handle the output folder creation\n    if is_main_process:\n        os.makedirs(output_dir, exist_ok=True)\n        os.makedirs(f\"{output_dir}/samples\", exist_ok=True)\n        os.makedirs(f\"{output_dir}/sanity_check\", exist_ok=True)\n        os.makedirs(f\"{output_dir}/checkpoints\", exist_ok=True)\n        OmegaConf.save(config, os.path.join(output_dir, 'config.yaml'))\n\n    # Load scheduler, tokenizer and models.\n    noise_scheduler = DDIMScheduler(**OmegaConf.to_container(noise_scheduler_kwargs))\n\n    vae          = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder=\"vae\")\n    tokenizer    = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder=\"tokenizer\")\n    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder=\"text_encoder\")\n    if not image_finetune:\n        
unet = UNet3DConditionModel.from_pretrained_2d(\n            pretrained_model_path, subfolder=\"unet\", \n            unet_additional_kwargs=OmegaConf.to_container(unet_additional_kwargs)\n        )\n    else:\n        unet = UNet2DConditionModel.from_pretrained(pretrained_model_path, subfolder=\"unet\")\n        \n    # Load pretrained unet weights\n    if unet_checkpoint_path != \"\":\n        zero_rank_print(f\"from checkpoint: {unet_checkpoint_path}\")\n        unet_checkpoint_path = torch.load(unet_checkpoint_path, map_location=\"cpu\")\n        if \"global_step\" in unet_checkpoint_path: zero_rank_print(f\"global_step: {unet_checkpoint_path['global_step']}\")\n        state_dict = unet_checkpoint_path[\"state_dict\"] if \"state_dict\" in unet_checkpoint_path else unet_checkpoint_path\n\n        m, u = unet.load_state_dict(state_dict, strict=False)\n        zero_rank_print(f\"missing keys: {len(m)}, unexpected keys: {len(u)}\")\n        assert len(u) == 0\n        \n    # Freeze vae and text_encoder\n    vae.requires_grad_(False)\n    text_encoder.requires_grad_(False)\n    \n    # Set unet trainable parameters\n    unet.requires_grad_(False)\n    for name, param in unet.named_parameters():\n        for trainable_module_name in trainable_modules:\n            if trainable_module_name in name:\n                param.requires_grad = True\n                break\n            \n    trainable_params = list(filter(lambda p: p.requires_grad, unet.parameters()))\n    optimizer = torch.optim.AdamW(\n        trainable_params,\n        lr=learning_rate,\n        betas=(adam_beta1, adam_beta2),\n        weight_decay=adam_weight_decay,\n        eps=adam_epsilon,\n    )\n\n    if is_main_process:\n        zero_rank_print(f\"trainable params number: {len(trainable_params)}\")\n        zero_rank_print(f\"trainable params scale: {sum(p.numel() for p in trainable_params) / 1e6:.3f} M\")\n\n    # Enable xformers\n    if enable_xformers_memory_efficient_attention:\n        if 
is_xformers_available():\n            unet.enable_xformers_memory_efficient_attention()\n        else:\n            raise ValueError(\"xformers is not available. Make sure it is installed correctly\")\n\n    # Enable gradient checkpointing\n    if gradient_checkpointing:\n        unet.enable_gradient_checkpointing()\n\n    # Move models to GPU\n    vae.to(local_rank)\n    text_encoder.to(local_rank)\n\n    # Get the training dataset\n    train_dataset = WebVid10M(**train_data, is_image=image_finetune)\n    distributed_sampler = DistributedSampler(\n        train_dataset,\n        num_replicas=num_processes,\n        rank=global_rank,\n        shuffle=True,\n        seed=global_seed,\n    )\n\n    # DataLoaders creation:\n    train_dataloader = torch.utils.data.DataLoader(\n        train_dataset,\n        batch_size=train_batch_size,\n        shuffle=False,\n        sampler=distributed_sampler,\n        num_workers=num_workers,\n        pin_memory=True,\n        drop_last=True,\n    )\n\n    # Get the training iteration\n    if max_train_steps == -1:\n        assert max_train_epoch != -1\n        max_train_steps = max_train_epoch * len(train_dataloader)\n        \n    if checkpointing_steps == -1:\n        assert checkpointing_epochs != -1\n        checkpointing_steps = checkpointing_epochs * len(train_dataloader)\n\n    if scale_lr:\n        learning_rate = (learning_rate * gradient_accumulation_steps * train_batch_size * num_processes)\n\n    # Scheduler\n    lr_scheduler = get_scheduler(\n        lr_scheduler,\n        optimizer=optimizer,\n        num_warmup_steps=lr_warmup_steps * gradient_accumulation_steps,\n        num_training_steps=max_train_steps * gradient_accumulation_steps,\n    )\n\n    # Validation pipeline\n    if not image_finetune:\n        validation_pipeline = AnimationPipeline(\n            unet=unet, vae=vae, tokenizer=tokenizer, text_encoder=text_encoder, scheduler=noise_scheduler,\n        ).to(\"cuda\")\n    else:\n        
validation_pipeline = StableDiffusionPipeline.from_pretrained(\n            pretrained_model_path,\n            unet=unet, vae=vae, tokenizer=tokenizer, text_encoder=text_encoder, scheduler=noise_scheduler, safety_checker=None,\n        )\n    validation_pipeline.enable_vae_slicing()\n\n    # DDP warpper\n    unet.to(local_rank)\n    unet = DDP(unet, device_ids=[local_rank], output_device=local_rank)\n\n    # We need to recalculate our total training steps as the size of the training dataloader may have changed.\n    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)\n    # Afterwards we recalculate our number of training epochs\n    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)\n\n    # Train!\n    total_batch_size = train_batch_size * num_processes * gradient_accumulation_steps\n\n    if is_main_process:\n        logging.info(\"***** Running training *****\")\n        logging.info(f\"  Num examples = {len(train_dataset)}\")\n        logging.info(f\"  Num Epochs = {num_train_epochs}\")\n        logging.info(f\"  Instantaneous batch size per device = {train_batch_size}\")\n        logging.info(f\"  Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}\")\n        logging.info(f\"  Gradient Accumulation steps = {gradient_accumulation_steps}\")\n        logging.info(f\"  Total optimization steps = {max_train_steps}\")\n    global_step = 0\n    first_epoch = 0\n\n    # Only show the progress bar once on each machine.\n    progress_bar = tqdm(range(global_step, max_train_steps), disable=not is_main_process)\n    progress_bar.set_description(\"Steps\")\n\n    # Support mixed-precision training\n    scaler = torch.cuda.amp.GradScaler() if mixed_precision_training else None\n\n    for epoch in range(first_epoch, num_train_epochs):\n        train_dataloader.sampler.set_epoch(epoch)\n        unet.train()\n        \n        for step, batch in enumerate(train_dataloader):\n            if cfg_random_null_text:\n                batch['text'] = [name if random.random() > cfg_random_null_text_ratio else \"\" for name in batch['text']]\n                \n            # Data batch sanity check\n            if epoch == first_epoch and step == 0:\n                pixel_values, texts = batch['pixel_values'].cpu(), batch['text']\n                if not image_finetune:\n                    pixel_values = rearrange(pixel_values, \"b f c h w -> b c f h w\")\n                    for idx, (pixel_value, text) in enumerate(zip(pixel_values, texts)):\n                        pixel_value = pixel_value[None, ...]\n                        save_videos_grid(pixel_value, f\"{output_dir}/sanity_check/{'-'.join(text.replace('/', '').split()[:10]) if not text == '' else f'{global_rank}-{idx}'}.gif\", rescale=True)\n                else:\n                    for idx, (pixel_value, text) in enumerate(zip(pixel_values, texts)):\n                        pixel_value = pixel_value / 2. 
+ 0.5\n                        torchvision.utils.save_image(pixel_value, f\"{output_dir}/sanity_check/{'-'.join(text.replace('/', '').split()[:10]) if not text == '' else f'{global_rank}-{idx}'}.png\")\n                    \n            ### >>>> Training >>>> ###\n            \n            # Convert videos to latent space            \n            pixel_values = batch[\"pixel_values\"].to(local_rank)\n            video_length = pixel_values.shape[1]\n            with torch.no_grad():\n                if not image_finetune:\n                    pixel_values = rearrange(pixel_values, \"b f c h w -> (b f) c h w\")\n                    latents = vae.encode(pixel_values).latent_dist\n                    latents = latents.sample()\n                    latents = rearrange(latents, \"(b f) c h w -> b c f h w\", f=video_length)\n                else:\n                    latents = vae.encode(pixel_values).latent_dist\n                    latents = latents.sample()\n\n                latents = latents * 0.18215\n\n            # Sample noise that we'll add to the latents\n            noise = torch.randn_like(latents)\n            bsz = latents.shape[0]\n            \n            # Sample a random timestep for each video\n            timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)\n            timesteps = timesteps.long()\n            \n            # Add noise to the latents according to the noise magnitude at each timestep\n            # (this is the forward diffusion process)\n            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)\n            \n            # Get the text embedding for conditioning\n            with torch.no_grad():\n                prompt_ids = tokenizer(\n                    batch['text'], max_length=tokenizer.model_max_length, padding=\"max_length\", truncation=True, return_tensors=\"pt\"\n                ).input_ids.to(latents.device)\n                encoder_hidden_states = 
text_encoder(prompt_ids)[0]\n                \n            # Get the target for loss depending on the prediction type\n            if noise_scheduler.config.prediction_type == \"epsilon\":\n                target = noise\n            elif noise_scheduler.config.prediction_type == \"v_prediction\":\n                raise NotImplementedError\n            else:\n                raise ValueError(f\"Unknown prediction type {noise_scheduler.config.prediction_type}\")\n\n            # Predict the noise residual and compute loss\n            # Mixed-precision training\n            with torch.cuda.amp.autocast(enabled=mixed_precision_training):\n                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample\n                loss = F.mse_loss(model_pred.float(), target.float(), reduction=\"mean\")\n\n            optimizer.zero_grad()\n\n            # Backpropagate\n            if mixed_precision_training:\n                scaler.scale(loss).backward()\n                \"\"\" >>> gradient clipping >>> \"\"\"\n                scaler.unscale_(optimizer)\n                torch.nn.utils.clip_grad_norm_(unet.parameters(), max_grad_norm)\n                \"\"\" <<< gradient clipping <<< \"\"\"\n                scaler.step(optimizer)\n                scaler.update()\n            else:\n                loss.backward()\n                \"\"\" >>> gradient clipping >>> \"\"\"\n                torch.nn.utils.clip_grad_norm_(unet.parameters(), max_grad_norm)\n                \"\"\" <<< gradient clipping <<< \"\"\"\n                optimizer.step()\n\n            lr_scheduler.step()\n            progress_bar.update(1)\n            global_step += 1\n            \n            ### <<<< Training <<<< ###\n            \n            # Wandb logging\n            if is_main_process and (not is_debug) and use_wandb:\n                wandb.log({\"train_loss\": loss.item()}, step=global_step)\n                \n            # Save checkpoint\n            if is_main_process 
and (global_step % checkpointing_steps == 0 or step == len(train_dataloader) - 1):\n                save_path = os.path.join(output_dir, f\"checkpoints\")\n                state_dict = {\n                    \"epoch\": epoch,\n                    \"global_step\": global_step,\n                    \"state_dict\": unet.state_dict(),\n                }\n                if step == len(train_dataloader) - 1:\n                    torch.save(state_dict, os.path.join(save_path, f\"checkpoint-epoch-{epoch+1}.ckpt\"))\n                else:\n                    torch.save(state_dict, os.path.join(save_path, f\"checkpoint.ckpt\"))\n                logging.info(f\"Saved state to {save_path} (global_step: {global_step})\")\n                \n            # Periodically validation\n            if is_main_process and (global_step % validation_steps == 0 or global_step in validation_steps_tuple):\n                samples = []\n                \n                generator = torch.Generator(device=latents.device)\n                generator.manual_seed(global_seed)\n                \n                height = train_data.sample_size[0] if not isinstance(train_data.sample_size, int) else train_data.sample_size\n                width  = train_data.sample_size[1] if not isinstance(train_data.sample_size, int) else train_data.sample_size\n\n                prompts = validation_data.prompts[:2] if global_step < 1000 and (not image_finetune) else validation_data.prompts\n\n                for idx, prompt in enumerate(prompts):\n                    if not image_finetune:\n                        sample = validation_pipeline(\n                            prompt,\n                            generator    = generator,\n                            video_length = train_data.sample_n_frames,\n                            height       = height,\n                            width        = width,\n                            **validation_data,\n                        ).videos\n                        
save_videos_grid(sample, f\"{output_dir}/samples/sample-{global_step}/{idx}.gif\")\n                        samples.append(sample)\n                        \n                    else:\n                        sample = validation_pipeline(\n                            prompt,\n                            generator           = generator,\n                            height              = height,\n                            width               = width,\n                            num_inference_steps = validation_data.get(\"num_inference_steps\", 25),\n                            guidance_scale      = validation_data.get(\"guidance_scale\", 8.),\n                        ).images[0]\n                        sample = torchvision.transforms.functional.to_tensor(sample)\n                        samples.append(sample)\n                \n                if not image_finetune:\n                    samples = torch.concat(samples)\n                    save_path = f\"{output_dir}/samples/sample-{global_step}.gif\"\n                    save_videos_grid(samples, save_path)\n                    \n                else:\n                    samples = torch.stack(samples)\n                    save_path = f\"{output_dir}/samples/sample-{global_step}.png\"\n                    torchvision.utils.save_image(samples, save_path, nrow=4)\n\n                logging.info(f\"Saved samples to {save_path}\")\n                \n            logs = {\"step_loss\": loss.detach().item(), \"lr\": lr_scheduler.get_last_lr()[0]}\n            progress_bar.set_postfix(**logs)\n            \n            if global_step >= max_train_steps:\n                break\n            \n    dist.destroy_process_group()\n\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--config\",   type=str, required=True)\n    parser.add_argument(\"--launcher\", type=str, choices=[\"pytorch\", \"slurm\"], default=\"pytorch\")\n    parser.add_argument(\"--wandb\",    
action=\"store_true\")\n    args = parser.parse_args()\n\n    name   = Path(args.config).stem\n    config = OmegaConf.load(args.config)\n\n    main(name=name, launcher=args.launcher, use_wandb=args.wandb, **config)\n"
  }
]