[
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "<div align=\"center\">\n\n<h1 align=\"center\">Vlogger: Make Your Dream A Vlog</h1>\n</a>\n\n[Shaobin Zhuang](https://github.com/zhuangshaobin), [Kunchang Li](https://scholar.google.com/citations?user=D4tLSbsAAAAJ), [Xinyuan Chen†](https://scholar.google.com/citations?user=3fWSC8YAAAAJ), [Yaohui Wang†](https://scholar.google.com/citations?user=R7LyAb4AAAAJ), [Ziwei Liu](https://scholar.google.com/citations?user=lc45xlcAAAAJ), [Yu Qiao](https://scholar.google.com/citations?user=gFtI-8QAAAAJ&hl), [Yali Wang†](https://scholar.google.com/citations?user=hD948dkAAAAJ)\n\n[![arXiv](https://img.shields.io/badge/arXiv-2401.09414-b31b1b.svg)](https://arxiv.org/abs/2401.09414)\n[![Project Page](https://img.shields.io/badge/Vlogger-Website-green)](https://zhuangshaobin.github.io/Vlogger.github.io/)\n[![Hugging Face Model](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow)](https://huggingface.co/GrayShine/Vlogger)\n[![Hugging Face Space](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/GrayShine/Vlogger-ShowMaker)\n[![YouTube Video](https://img.shields.io/badge/YouTube-Video-red)](https://youtu.be/ZRD1-jHbEGk)\n\n</div>\n</div>\n\n\nIn this work, we present **Vlogger**, a generic AI system for generating a **minute**-level video blog (i.e., vlog) of user descriptions. Different from short videos with a few seconds, vlog often contains a complex storyline with diversified scenes, which is challenging for most existing video generation approaches. To break through this bottleneck, our Vlogger smartly leverages Large Language Model (LLM) as Director and decomposes a long video generation task of vlog into four key stages, where we invoke various foundation models to play the critical roles of vlog professionals, including (1) Script, (2) Actor, (3) ShowMaker, and (4) Voicer. 
With such a design of mimicking human beings, our Vlogger can generate vlogs through explainable cooperation of top-down planning and bottom-up shooting. Moreover, we introduce a novel video diffusion model, **ShowMaker**, which serves as a videographer in our Vlogger for generating the video snippet of each shooting scene. By incorporating Script and Actor attentively as textual and visual prompts, it can effectively enhance spatial-temporal coherence in the snippet. Besides, we design a concise mixed training paradigm for ShowMaker, boosting its capacity for both T2V generation and prediction. Finally, the extensive experiments show that our method achieves state-of-the-art performance on zero-shot T2V generation and prediction tasks. More importantly, Vlogger can generate over 5-minute vlogs from open-world descriptions, without loss of video coherence on script and actor.\n\n\n<div align=\"center\">\n<video src=\"https://github.com/zhuangshaobin/Vlogger/assets/94739615/1e8dd246-d3b9-49e9-8eee-d40b6d8523b9\" controls=\"controls\" width=\"500\" height=\"300\"></video>\n<b>A compressed version of generated <a href=\"https://youtu.be/ZRD1-jHbEGk\">Teddy Travel</a>.</b>\n</div>\n\n## Usage\n\n<details>\n  <summary><h3>Setup</h3></summary>\n\n<h4>Prepare Environment</h4>\n\n```bash\nconda create -n vlogger python==3.10.11\nconda activate vlogger\npip install -r requirements.txt\n```\n\n<h4>Download our model and T2I base model</h4>\n\nOur model is based on Stable Diffusion v1.4; you may download [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) and [OpenCLIP-ViT-H-14](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K) to the directory of ```pretrained```.\nDownload our model (ShowMaker) checkpoint (from [google drive](https://drive.google.com/file/d/1pAH73kz2QRfD2Dxk4lL3SrHvLAlWcPI3/view?usp=drive_link) or [hugging face](https://huggingface.co/GrayShine/Vlogger/tree/main)) and save it to the directory of ```pretrained```.\n\n\nNow 
under `./pretrained`, you should be able to see the following:\n```\n├── pretrained\n│   ├── ShowMaker.pt\n│   ├── stable-diffusion-v1-4\n│   ├── OpenCLIP-ViT-H-14\n│   │   ├── ...\n└── └── ├── ...\n        ├── ...\n```\n\n</details>\n\n\n<details>\n  <summary><h3>Inference for LLM planning and making reference images</h3></summary>\n        \nRun the following command to get script, actors and protagonist:\n\n```python\npython sample_scripts/vlog_write_script.py\n```\n\n- The generated scripts will be saved in ```results/vlog/$your_story_dir/script```.\n\n- The generated reference images will be saved in ```results/vlog/$your_story_dir/img```.\n\n- :warning: Enter your OpenAI key in the 7th line of the file ```vlogger/planning_utils/gpt4_utils.py```\n</details>\n\n<details>\n  <summary><h3>Inference for vlog generation</h3></summary>\n        \nRun the following command to get the vlog:\n\n```python\npython sample_scripts/vlog_read_script_sample.py\n```\n\n- The generated vlog will be saved in ```results/vlog/$your_story_dir/video```.\n</details>\n\n<details>\n  <summary><h3>Inference for (T+I)2V </h3></summary>\n        \nRun the following command to get the (T+I)2V results:\n\n```python\npython sample_scripts/with_mask_sample.py\n```\n\n- The generated video will be saved in ```results/mask_no_ref```.\n</details>\n\n<details>\n  <summary><h3>Inference for (T+I+Ref)2V</h3></summary>\n        \nRun the following command to get the (T+I+Ref)2V results:\n\n```python\npython sample_scripts/with_mask_ref_sample.py\n```\n\n- The generated video will be saved in ```results/mask_ref```.\n</details>\n\n<details>\n  <summary><h3>More Details</h3></summary>\n        \nYou may modify ```configs/with_mask_sample.yaml``` to change the (T+I)2V conditions and modify ```configs/with_mask_ref_sample.yaml``` to change the (T+I+Ref)2V conditions.\nFor example:\n\n- ```ckpt``` is used to specify a model checkpoint.\n\n- ```text_prompt``` is used to describe the content of the 
video.\n\n- ```input_path``` is used to specify the path to the image.\n\n- ```ref_path``` is used to specify the path to the reference image.\n\n- ```save_path``` is used to specify the path to the generated video.\n</details>\n\n\n\n## Results\n### (T+Ref)2V Results\n<table class=\"center\">\n<tr>\n  <td style=\"text-align:center;width: 50%\" colspan=\"1\"><b>Reference Image</b></td>\n  <td style=\"text-align:center;width: 50%\" colspan=\"1\"><b>Output Video</b></td>\n</tr>\n<tr>\n  <td><img src=\"examples/TR2V/image/Egyptian_Pyramids.png\" width=\"250\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        Scene Reference\n      </div> -->\n      <p align=\"center\">Scene Reference</p>\n  </td>\n  <td>\n      <img src=\"examples/TR2V/video/Fireworks_explode_over_the_pyramids.gif\" width=\"400\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        Fireworks explode over the pyramids.\n      </div> -->\n          <p align=\"center\">Fireworks explode over the pyramids.</p>\n  </td>\n</tr>\n\n<tr>\n  <td><img src=\"examples/TR2V/image/Great_Wall.png\" width=\"250\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        Scene Reference\n      </div> -->\n      <p align=\"center\">Scene Reference</p>\n  </td>\n  <td>\n      <img src=\"examples/TR2V/video/The_Great_Wall_burning_with_raging_fire.gif\" width=\"400\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        The Great Wall burning with raging fire.\n      </div> -->\n          <p align=\"center\">The Great Wall burning with raging fire.</p>\n  </td>\n</tr>\n\n<tr>\n  <td><img src=\"examples/TR2V/image/a_green_cat.png\" width=\"250\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        Object Reference\n      </div> -->\n      <p align=\"center\">Object Reference</p>\n  </td>\n  <td>\n      <img src=\"examples/TR2V/video/A_cat_is_running_on_the_beach.gif\" 
width=\"400\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        A cat is running on the beach.\n      </div> -->\n          <p align=\"center\">A cat is running on the beach.</p>\n  </td>\n</tr>\n\n</table>\n\n### (T+I)2V Results\n<table class=\"center\">\n<tr>\n  <td style=\"text-align:center;width: 50%\" colspan=\"1\"><b>Input Image</b></td>\n  <td style=\"text-align:center;width: 50%\" colspan=\"1\"><b>Output Video</b></td>\n</tr>\n<tr>\n  <td><img src=\"input/i2v/Underwater_environment_cosmetic_bottles.png\" width=\"400\"></td>\n  <td>\n      <img src=\"examples/TI2V/Underwater_environment_cosmetic_bottles.gif\" width=\"400\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        Underwater environment cosmetic bottles.\n      </div> -->\n          <p align=\"center\">Underwater environment cosmetic bottles.</p>\n  </td>\n</tr>\n\n<tr>\n  <td><img src=\"input/i2v/A_big_drop_of_water_falls_on_a_rose_petal.png\" width=\"400\"></td>\n  <td>\n      <img src=\"examples/TI2V/A_big_drop_of_water_falls_on_a_rose_petal.gif\" width=\"400\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        A big drop of water falls on a rose petal.\n      </div> -->\n          <p align=\"center\">A big drop of water falls on a rose petal.</p>\n  </td>\n</tr>\n\n<tr>\n  <td><img src=\"input/i2v/A_fish_swims_past_an_oriental_woman.png\" width=\"400\"></td>\n  <td>\n      <img src=\"examples/TI2V/A_fish_swims_past_an_oriental_woman.gif\" width=\"400\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        A fish swims past an oriental woman.\n      </div> -->\n          <p align=\"center\">A fish swims past an oriental woman.</p>\n  </td>\n</tr>\n\n<tr>\n  <td><img src=\"input/i2v/Cinematic_photograph_View_of_piloting_aaero.png\" width=\"400\"></td>\n  <td>\n      <img src=\"examples/TI2V/Cinematic_photograph_View_of_piloting_aaero.gif\" width=\"400\">\n      
<br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        Cinematic photograph. View of piloting aaero.\n      </div> -->\n          <p align=\"center\">Cinematic photograph. View of piloting aaero.</p>\n  </td>\n</tr>\n\n<tr>\n  <td><img src=\"input/i2v/Planet_hits_earth.png\" width=\"400\"></td>\n  <td>\n      <img src=\"examples/TI2V/Planet_hits_earth.gif\" width=\"400\">\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        Planet hits earth.\n      </div> -->\n          <p align=\"center\">Planet hits earth.</p>\n  </td>\n</tr>\n</table>\n\n\n### T2V Results\n<table>\n<tr>\n  <td style=\"text-align:center;width: 66%\" colspan=\"2\"><b>Output Video</b></td>\n</tr>\n<tr>\n  <td>\n      <img src=\"examples/T2V/A_deer_looks_at_the_sunset_behind_him.gif\"/>\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        A deer looks at the sunset behind him.\n      </div> -->\n          <p align=\"center\">A deer looks at the sunset behind him.</p>\n  </td>\n  <td>\n      <img src=\"examples/T2V/A_duck_is_teaching_math_to_another_duck.gif\"/>\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        A duck is teaching math to another duck.\n      </div> -->\n          <p align=\"center\">A duck is teaching math to another duck.</p>\n  </td>\n</tr>\n<tr>\n  <td>\n      <img src=\"examples/T2V/Bezos_explores_tropical_rainforest.gif\"/>\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        Bezos explores tropical rainforest.\n      </div> -->\n          <p align=\"center\">Bezos explores tropical rainforest.</p>\n  </td>\n  <td>\n      <img src=\"examples/T2V/Light_blue_water_lapping_on_the_beach.gif\"/>\n      <br>\n<!--       <div class=\"text\" style=\" text-align:center;\">\n        Light blue water lapping on the beach.\n      </div> -->\n          <p align=\"center\">Light blue water lapping on the beach.</p>\n  
</td>\n</tr>\n\n</table>\n\n## BibTeX\n```bibtex\n@article{zhuang2024vlogger,\ntitle={Vlogger: Make Your Dream A Vlog},\nauthor={Zhuang, Shaobin and Li, Kunchang and Chen, Xinyuan and Wang, Yaohui and Liu, Ziwei and Qiao, Yu and Wang, Yali},\njournal={arXiv preprint arXiv:2401.09414},\nyear={2024}\n}\n```\n\n```bibtex\n@article{chen2023seine,\ntitle={SEINE: Short-to-Long Video Diffusion Model for Generative Transition and Prediction},\nauthor={Chen, Xinyuan and Wang, Yaohui and Zhang, Lingjun and Zhuang, Shaobin and Ma, Xin and Yu, Jiashuo and Wang, Yali and Lin, Dahua and Qiao, Yu and Liu, Ziwei},\njournal={arXiv preprint arXiv:2310.20700},\nyear={2023}\n}\n```\n\n```bibtex\n@article{wang2023lavie,\n  title={LAVIE: High-Quality Video Generation with Cascaded Latent Diffusion Models},\n  author={Wang, Yaohui and Chen, Xinyuan and Ma, Xin and Zhou, Shangchen and Huang, Ziqi and Wang, Yi and Yang, Ceyuan and He, Yinan and Yu, Jiashuo and Yang, Peiqing and others},\n  journal={arXiv preprint arXiv:2309.15103},\n  year={2023}\n}\n```\n\n\n## Disclaimer\nWe disclaim responsibility for user-generated content. The model was not trained to realistically represent people or events, so using it to generate such content is beyond the model's capabilities. It is prohibited for pornographic, violent and bloody content generation, and to generate content that is demeaning or harmful to people or their environment, culture, religion, etc. Users are solely liable for their actions. The project contributors are not legally affiliated with, nor accountable for users' behaviors. 
Use the generative model responsibly, adhering to ethical and legal standards.\n\n## Contact Us\n**Shaobin Zhuang**: [zhuangshaobin@pjlab.org.cn](mailto:zhuangshaobin@pjlab.org.cn), **Kunchang Li**: [likunchang@pjlab.org.cn](mailto:likunchang@pjlab.org.cn)\n\n**Xinyuan Chen**: [chenxinyuan@pjlab.org.cn](mailto:chenxinyuan@pjlab.org.cn), **Yaohui Wang**: [wangyaohui@pjlab.org.cn](mailto:wangyaohui@pjlab.org.cn)  \n\n## Acknowledgements\nThe code is built upon [SEINE](https://github.com/Vchitect/SEINE), [LaVie](https://github.com/Vchitect/LaVie), [diffusers](https://github.com/huggingface/diffusers) and [Stable Diffusion](https://github.com/CompVis/stable-diffusion), we thank all the contributors for open-sourcing. \n\n\n## License\n\nThe code is licensed under Apache-2.0, model weights are fully open for academic research and also allow **free** commercial usage. To apply for a commercial license, please contact zhuangshaobin@pjlab.org.cn.\n\n"
  },
  {
    "path": "configs/vlog_read_script_sample.yaml",
    "content": "# path:\nckpt: \"pretrained/ShowMaker.pt\"\npretrained_model_path: \"pretrained/stable-diffusion-v1-4/\"\nimage_encoder_path: \"pretrained/OpenCLIP-ViT-H-14\"\nsave_path: \"results/vlog/teddy_travel/video\"\n\n# script path\nreference_image_path: [\"results/vlog/teddy_travel/ref_img/teddy.jpg\"]\nscript_file_path: \"results/vlog/teddy_travel/script/video_prompts.txt\"\nzh_script_file_path: \"results/vlog/teddy_travel/script/zh_video_prompts.txt\"\nprotagonist_file_path: \"results/vlog/teddy_travel/script/protagonists_places.txt\"\nreference_file_path: \"results/vlog/teddy_travel/script/protagonist_place_reference.txt\"\ntime_file_path: \"results/vlog/teddy_travel/script/time_scripts.txt\"\nvideo_transition: False\n\n# model config:\nmodel: UNet\nnum_frames: 16\nimage_size: [320, 512]\nnegative_prompt: \"white background\"\n\n# sample config:\nref_cfg_scale: 0.3\nseed: 3407\nguidance_scale: 7.5\ncfg_scale: 8.0\nsample_method: 'ddim'\nnum_sampling_steps: 100\nresearve_frame: 3\nmask_type: \"first3\"\nuse_mask: True\nuse_fp16: True\nenable_xformers_memory_efficient_attention: True\ndo_classifier_free_guidance: True\nfps: 8\nsample_num: \n\n# model speedup\nuse_compile: False"
  },
  {
    "path": "configs/vlog_write_script.yaml",
    "content": "# script path\nstory_path: \"./results/vlog/teddy_travel_/story.txt\"\nonly_one_protagonist: False"
  },
  {
    "path": "configs/with_mask_ref_sample.yaml",
    "content": "# path config:\nckpt: \"pretrained/ShowMaker.pt\"\npretrained_model_path: \"pretrained/stable-diffusion-v1-4/\"\nimage_encoder_path: \"pretrained/OpenCLIP-ViT-H-14\"\ninput_path: 'input/i2v/Planet_hits_earth.png'\nref_path: 'input/i2v/Planet_hits_earth.png'\nsave_path: \"results/mask_ref/\"\n\n# model config: \nmodel: UNet\nnum_frames: 16\n# image_size: [320, 512]\nimage_size: [240, 560]\n\n# model speedup\nuse_fp16: True\nenable_xformers_memory_efficient_attention: True\n\n# sample config:\nseed: 3407\ncfg_scale: 8.0\nref_cfg_scale: 0.5\nsample_method: 'ddim'\nnum_sampling_steps: 100\ntext_prompt: [\n              # \"Cinematic photograph. View of piloting aaero.\",\n              # \"A fish swims past an oriental woman.\",\n              # \"A big drop of water falls on a rose petal.\",\n              # \"Underwater environment cosmetic bottles.\".\n              \"Planet hits earth.\",\n              ]\nadditional_prompt: \"\"\nnegative_prompt: \"\"\ndo_classifier_free_guidance: True\nmask_type: \"first1\"\nuse_mask: True\n"
  },
  {
    "path": "configs/with_mask_sample.yaml",
    "content": "# path config:\nckpt: \"pretrained/ShowMaker.pt\"\npretrained_model_path: \"pretrained/OpenCLIP-ViT-H-14\"\ninput_path: 'input/i2v/Planet_hits_earth.png'\nsave_path: \"results/mask_no_ref/\"\n\n# model config: \nmodel: UNet\nnum_frames: 16\n# image_size: [320, 512]\nimage_size: [240, 560]\n\n# model speedup\nuse_fp16: True\nenable_xformers_memory_efficient_attention: True\n\n# sample config:\nseed: 3407\ncfg_scale: 8.0\nsample_method: 'ddim'\nnum_sampling_steps: 100\ntext_prompt: [\n              # \"Cinematic photograph. View of piloting aaero.\",\n              # \"A fish swims past an oriental woman.\",\n              # \"A big drop of water falls on a rose petal.\",\n              # \"Underwater environment cosmetic bottles.\".\n              \"Planet hits earth.\",\n              ]\nadditional_prompt: \"\"\nnegative_prompt: \"\"\ndo_classifier_free_guidance: True\nmask_type: \"first1\"\nuse_mask: True\n"
  },
  {
    "path": "datasets/video_transforms.py",
    "content": "import torch\r\nimport random\r\nimport numbers\r\nfrom torchvision.transforms import RandomCrop, RandomResizedCrop\r\nfrom PIL import Image\r\n\r\ndef _is_tensor_video_clip(clip):\r\n    if not torch.is_tensor(clip):\r\n        raise TypeError(\"clip should be Tensor. Got %s\" % type(clip))\r\n\r\n    if not clip.ndimension() == 4:\r\n        raise ValueError(\"clip should be 4D. Got %dD\" % clip.dim())\r\n\r\n    return True\r\n\r\n\r\ndef center_crop_arr(pil_image, image_size):\r\n    \"\"\"\r\n    Center cropping implementation from ADM.\r\n    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126\r\n    \"\"\"\r\n    while min(*pil_image.size) >= 2 * image_size:\r\n        pil_image = pil_image.resize(\r\n            tuple(x // 2 for x in pil_image.size), resample=Image.BOX\r\n        )\r\n\r\n    scale = image_size / min(*pil_image.size)\r\n    pil_image = pil_image.resize(\r\n        tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC\r\n    )\r\n\r\n    arr = np.array(pil_image)\r\n    crop_y = (arr.shape[0] - image_size) // 2\r\n    crop_x = (arr.shape[1] - image_size) // 2\r\n    return Image.fromarray(arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size])\r\n\r\n\r\ndef crop(clip, i, j, h, w):\r\n    \"\"\"\r\n    Args:\r\n        clip (torch.tensor): Video clip to be cropped. 
Size is (T, C, H, W)\r\n    \"\"\"\r\n    if len(clip.size()) != 4:\r\n        raise ValueError(\"clip should be a 4D tensor\")\r\n    return clip[..., i : i + h, j : j + w]\r\n\r\n\r\ndef resize(clip, target_size, interpolation_mode):\r\n    if len(target_size) != 2:\r\n        raise ValueError(f\"target size should be tuple (height, width), instead got {target_size}\")\r\n    return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False)\r\n\r\ndef resize_scale(clip, target_size, interpolation_mode):\r\n    if len(target_size) != 2:\r\n        raise ValueError(f\"target size should be tuple (height, width), instead got {target_size}\")\r\n    H, W = clip.size(-2), clip.size(-1)\r\n    scale_ = target_size[0] / min(H, W)\r\n    return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)\r\n\r\ndef resize_with_scale_factor(clip, scale_factor, interpolation_mode):\r\n    return torch.nn.functional.interpolate(clip, scale_factor=scale_factor, mode=interpolation_mode, align_corners=False)\r\n\r\ndef resize_scale_with_height(clip, target_size, interpolation_mode):\r\n    H, W = clip.size(-2), clip.size(-1)\r\n    scale_ = target_size / H\r\n    return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)\r\n\r\ndef resize_scale_with_weight(clip, target_size, interpolation_mode):\r\n    H, W = clip.size(-2), clip.size(-1)\r\n    scale_ = target_size / W\r\n    return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)\r\n\r\n\r\ndef resized_crop(clip, i, j, h, w, size, interpolation_mode=\"bilinear\"):\r\n    \"\"\"\r\n    Do spatial cropping and resizing to the video clip\r\n    Args:\r\n        clip (torch.tensor): Video clip to be cropped. 
Size is (T, C, H, W)\r\n        i (int): i in (i,j) i.e coordinates of the upper left corner.\r\n        j (int): j in (i,j) i.e coordinates of the upper left corner.\r\n        h (int): Height of the cropped region.\r\n        w (int): Width of the cropped region.\r\n        size (tuple(int, int)): height and width of resized clip\r\n    Returns:\r\n        clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)\r\n    \"\"\"\r\n    if not _is_tensor_video_clip(clip):\r\n        raise ValueError(\"clip should be a 4D torch.tensor\")\r\n    clip = crop(clip, i, j, h, w)\r\n    clip = resize(clip, size, interpolation_mode)\r\n    return clip\r\n\r\n\r\ndef center_crop(clip, crop_size):\r\n    if not _is_tensor_video_clip(clip):\r\n        raise ValueError(\"clip should be a 4D torch.tensor\")\r\n    h, w = clip.size(-2), clip.size(-1)\r\n    # print(clip.shape)\r\n    th, tw = crop_size\r\n    if h < th or w < tw:\r\n        # print(h, w)\r\n        raise ValueError(\"height {} and width {} must be no smaller than crop_size\".format(h, w))\r\n\r\n    i = int(round((h - th) / 2.0))\r\n    j = int(round((w - tw) / 2.0))\r\n    return crop(clip, i, j, th, tw)\r\n\r\n\r\ndef center_crop_using_short_edge(clip):\r\n    if not _is_tensor_video_clip(clip):\r\n        raise ValueError(\"clip should be a 4D torch.tensor\")\r\n    h, w = clip.size(-2), clip.size(-1)\r\n    if h < w:\r\n        th, tw = h, h\r\n        i = 0\r\n        j = int(round((w - tw) / 2.0))\r\n    else:\r\n        th, tw = w, w\r\n        i = int(round((h - th) / 2.0))\r\n        j = 0\r\n    return crop(clip, i, j, th, tw)\r\n\r\n\r\ndef random_shift_crop(clip):\r\n    '''\r\n    Slide along the long edge, with the short edge as crop size\r\n    '''\r\n    if not _is_tensor_video_clip(clip):\r\n        raise ValueError(\"clip should be a 4D torch.tensor\")\r\n    h, w = clip.size(-2), clip.size(-1)\r\n    \r\n    if h <= w:\r\n        long_edge = w\r\n        short_edge = h\r\n    
else:\r\n        long_edge = h\r\n        short_edge =w\r\n\r\n    th, tw = short_edge, short_edge\r\n\r\n    i = torch.randint(0, h - th + 1, size=(1,)).item()\r\n    j = torch.randint(0, w - tw + 1, size=(1,)).item()\r\n    return crop(clip, i, j, th, tw)\r\n\r\n\r\ndef to_tensor(clip):\r\n    \"\"\"\r\n    Convert tensor data type from uint8 to float, divide value by 255.0 and\r\n    permute the dimensions of clip tensor\r\n    Args:\r\n        clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)\r\n    Return:\r\n        clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)\r\n    \"\"\"\r\n    _is_tensor_video_clip(clip)\r\n    if not clip.dtype == torch.uint8:\r\n        raise TypeError(\"clip tensor should have data type uint8. Got %s\" % str(clip.dtype))\r\n    # return clip.float().permute(3, 0, 1, 2) / 255.0\r\n    return clip.float() / 255.0\r\n\r\n\r\ndef normalize(clip, mean, std, inplace=False):\r\n    \"\"\"\r\n    Args:\r\n        clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)\r\n        mean (tuple): pixel RGB mean. Size is (3)\r\n        std (tuple): pixel standard deviation. Size is (3)\r\n    Returns:\r\n        normalized clip (torch.tensor): Size is (T, C, H, W)\r\n    \"\"\"\r\n    if not _is_tensor_video_clip(clip):\r\n        raise ValueError(\"clip should be a 4D torch.tensor\")\r\n    if not inplace:\r\n        clip = clip.clone()\r\n    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)\r\n    # print(mean)\r\n    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)\r\n    clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])\r\n    return clip\r\n\r\n\r\ndef hflip(clip):\r\n    \"\"\"\r\n    Args:\r\n        clip (torch.tensor): Video clip to be normalized. 
Size is (T, C, H, W)\r\n    Returns:\r\n        flipped clip (torch.tensor): Size is (T, C, H, W)\r\n    \"\"\"\r\n    if not _is_tensor_video_clip(clip):\r\n        raise ValueError(\"clip should be a 4D torch.tensor\")\r\n    return clip.flip(-1)\r\n\r\n\r\nclass RandomCropVideo:\r\n    def __init__(self, size):\r\n        if isinstance(size, numbers.Number):\r\n            self.size = (int(size), int(size))\r\n        else:\r\n            self.size = size\r\n\r\n    def __call__(self, clip):\r\n        \"\"\"\r\n        Args:\r\n            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)\r\n        Returns:\r\n            torch.tensor: randomly cropped video clip.\r\n                size is (T, C, OH, OW)\r\n        \"\"\"\r\n        i, j, h, w = self.get_params(clip)\r\n        return crop(clip, i, j, h, w)\r\n    \r\n    def get_params(self, clip):\r\n        h, w = clip.shape[-2:]\r\n        th, tw = self.size\r\n\r\n        if h < th or w < tw:\r\n            raise ValueError(f\"Required crop size {(th, tw)} is larger than input image size {(h, w)}\")\r\n\r\n        if w == tw and h == th:\r\n            return 0, 0, h, w\r\n\r\n        i = torch.randint(0, h - th + 1, size=(1,)).item()\r\n        j = torch.randint(0, w - tw + 1, size=(1,)).item()\r\n\r\n        return i, j, th, tw\r\n\r\n    def __repr__(self) -> str:\r\n        return f\"{self.__class__.__name__}(size={self.size})\"\r\n    \r\nclass CenterCropResizeVideo:\r\n    '''\r\n    First use the short side for cropping length, \r\n    center crop video, then resize to the specified size\r\n    '''\r\n    def __init__(\r\n        self,\r\n        size,\r\n        interpolation_mode=\"bilinear\",\r\n    ):\r\n        if isinstance(size, tuple):\r\n            if len(size) != 2:\r\n                raise ValueError(f\"size should be tuple (height, width), instead got {size}\")\r\n            self.size = size\r\n        else:\r\n            self.size = (size, size)\r\n\r\n        
self.interpolation_mode = interpolation_mode\r\n       \r\n\r\n    def __call__(self, clip):\r\n        \"\"\"\r\n        Args:\r\n            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)\r\n        Returns:\r\n            torch.tensor: scale resized / center cropped video clip.\r\n                size is (T, C, crop_size, crop_size)\r\n        \"\"\"\r\n        # print(clip.shape)\r\n        clip_center_crop = center_crop_using_short_edge(clip)\r\n        # print(clip_center_crop.shape) 320 512\r\n        clip_center_crop_resize = resize(clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode)\r\n        return clip_center_crop_resize\r\n\r\n    def __repr__(self) -> str:\r\n        return f\"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}\"\r\n \r\n\r\nclass CenterCropVideo:\r\n    def __init__(\r\n        self,\r\n        size,\r\n        interpolation_mode=\"bilinear\",\r\n    ):\r\n        if isinstance(size, tuple):\r\n            if len(size) != 2:\r\n                raise ValueError(f\"size should be tuple (height, width), instead got {size}\")\r\n            self.size = size\r\n        else:\r\n            self.size = (size, size)\r\n\r\n        self.interpolation_mode = interpolation_mode\r\n       \r\n\r\n    def __call__(self, clip):\r\n        \"\"\"\r\n        Args:\r\n            clip (torch.tensor): Video clip to be cropped. 
Size is (T, C, H, W)\r\n        Returns:\r\n            torch.tensor: center cropped video clip.\r\n                size is (T, C, crop_size, crop_size)\r\n        \"\"\"\r\n        clip_center_crop = center_crop(clip, self.size)\r\n        return clip_center_crop\r\n\r\n    def __repr__(self) -> str:\r\n        return f\"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}\"\r\n    \r\n\r\nclass NormalizeVideo:\r\n    \"\"\"\r\n    Normalize the video clip by mean subtraction and division by standard deviation\r\n    Args:\r\n        mean (3-tuple): pixel RGB mean\r\n        std (3-tuple): pixel RGB standard deviation\r\n        inplace (boolean): whether do in-place normalization\r\n    \"\"\"\r\n\r\n    def __init__(self, mean, std, inplace=False):\r\n        self.mean = mean\r\n        self.std = std\r\n        self.inplace = inplace\r\n\r\n    def __call__(self, clip):\r\n        \"\"\"\r\n        Args:\r\n            clip (torch.tensor): video clip must be normalized. 
Size is (C, T, H, W)\r\n        \"\"\"\r\n        return normalize(clip, self.mean, self.std, self.inplace)\r\n\r\n    def __repr__(self) -> str:\r\n        return f\"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})\"\r\n\r\n\r\nclass ToTensorVideo:\r\n    \"\"\"\r\n    Convert tensor data type from uint8 to float, divide value by 255.0 and\r\n    permute the dimensions of clip tensor\r\n    \"\"\"\r\n\r\n    def __init__(self):\r\n        pass\r\n\r\n    def __call__(self, clip):\r\n        \"\"\"\r\n        Args:\r\n            clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)\r\n        Return:\r\n            clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)\r\n        \"\"\"\r\n        return to_tensor(clip)\r\n\r\n    def __repr__(self) -> str:\r\n        return self.__class__.__name__\r\n\r\n\r\nclass ResizeVideo():\r\n    '''\r\n    First use the short side for cropping length, \r\n    center crop video, then resize to the specified size\r\n    '''\r\n    def __init__(\r\n        self,\r\n        size,\r\n        interpolation_mode=\"bilinear\",\r\n    ):\r\n        if isinstance(size, tuple):\r\n            if len(size) != 2:\r\n                raise ValueError(f\"size should be tuple (height, width), instead got {size}\")\r\n            self.size = size\r\n        else:\r\n            self.size = (size, size)\r\n\r\n        self.interpolation_mode = interpolation_mode\r\n       \r\n\r\n    def __call__(self, clip):\r\n        \"\"\"\r\n        Args:\r\n            clip (torch.tensor): Video clip to be cropped. 
Size is (T, C, H, W)\r\n        Returns:\r\n            torch.tensor: scale resized / center cropped video clip.\r\n                size is (T, C, crop_size, crop_size)\r\n        \"\"\"\r\n        clip_resize = resize(clip, target_size=self.size, interpolation_mode=self.interpolation_mode)\r\n        return clip_resize\r\n\r\n    def __repr__(self) -> str:\r\n        return f\"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}\"\r\n    \r\n#  ------------------------------------------------------------\r\n#  ---------------------  Sampling  ---------------------------\r\n#  ------------------------------------------------------------\r\n"
  },
  {
    "path": "diffusion/__init__.py",
    "content": "# Modified from OpenAI's diffusion repos\r\n#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py\r\n#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion\r\n#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py\r\n\r\nfrom . import gaussian_diffusion as gd\r\nfrom .respace import SpacedDiffusion, space_timesteps\r\n\r\n\r\ndef create_diffusion(\r\n    timestep_respacing,\r\n    noise_schedule=\"linear\", \r\n    use_kl=False,\r\n    sigma_small=False,\r\n    predict_xstart=False,\r\n    # learn_sigma=True,\r\n    learn_sigma=False, # for unet\r\n    rescale_learned_sigmas=False,\r\n    diffusion_steps=1000\r\n):\r\n    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)\r\n    if use_kl:\r\n        loss_type = gd.LossType.RESCALED_KL\r\n    elif rescale_learned_sigmas:\r\n        loss_type = gd.LossType.RESCALED_MSE\r\n    else:\r\n        loss_type = gd.LossType.MSE\r\n    if timestep_respacing is None or timestep_respacing == \"\":\r\n        timestep_respacing = [diffusion_steps]\r\n    return SpacedDiffusion(\r\n        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),\r\n        betas=betas,\r\n        model_mean_type=(\r\n            gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X\r\n        ),\r\n        model_var_type=(\r\n            (\r\n                gd.ModelVarType.FIXED_LARGE\r\n                if not sigma_small\r\n                else gd.ModelVarType.FIXED_SMALL\r\n            )\r\n            if not learn_sigma\r\n            else gd.ModelVarType.LEARNED_RANGE\r\n        ),\r\n        loss_type=loss_type\r\n        # rescale_timesteps=rescale_timesteps,\r\n    )\r\n"
  },
  {
    "path": "diffusion/diffusion_utils.py",
    "content": "# Modified from OpenAI's diffusion repos\r\n#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py\r\n#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion\r\n#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py\r\n\r\nimport torch as th\r\nimport numpy as np\r\n\r\n\r\ndef normal_kl(mean1, logvar1, mean2, logvar2):\r\n    \"\"\"\r\n    Compute the KL divergence between two gaussians.\r\n    Shapes are automatically broadcasted, so batches can be compared to\r\n    scalars, among other use cases.\r\n    \"\"\"\r\n    tensor = None\r\n    for obj in (mean1, logvar1, mean2, logvar2):\r\n        if isinstance(obj, th.Tensor):\r\n            tensor = obj\r\n            break\r\n    assert tensor is not None, \"at least one argument must be a Tensor\"\r\n\r\n    # Force variances to be Tensors. Broadcasting helps convert scalars to\r\n    # Tensors, but it does not work for th.exp().\r\n    logvar1, logvar2 = [\r\n        x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)\r\n        for x in (logvar1, logvar2)\r\n    ]\r\n\r\n    return 0.5 * (\r\n        -1.0\r\n        + logvar2\r\n        - logvar1\r\n        + th.exp(logvar1 - logvar2)\r\n        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)\r\n    )\r\n\r\n\r\ndef approx_standard_normal_cdf(x):\r\n    \"\"\"\r\n    A fast approximation of the cumulative distribution function of the\r\n    standard normal.\r\n    \"\"\"\r\n    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))\r\n\r\n\r\ndef continuous_gaussian_log_likelihood(x, *, means, log_scales):\r\n    \"\"\"\r\n    Compute the log-likelihood of a continuous Gaussian distribution.\r\n    :param x: the targets\r\n    :param means: the Gaussian mean Tensor.\r\n    :param log_scales: the Gaussian log stddev Tensor.\r\n    :return: a tensor like x of log probabilities (in nats).\r\n   
 \"\"\"\r\n    centered_x = x - means\r\n    inv_stdv = th.exp(-log_scales)\r\n    normalized_x = centered_x * inv_stdv\r\n    log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x)\r\n    return log_probs\r\n\r\n\r\ndef discretized_gaussian_log_likelihood(x, *, means, log_scales):\r\n    \"\"\"\r\n    Compute the log-likelihood of a Gaussian distribution discretizing to a\r\n    given image.\r\n    :param x: the target images. It is assumed that this was uint8 values,\r\n              rescaled to the range [-1, 1].\r\n    :param means: the Gaussian mean Tensor.\r\n    :param log_scales: the Gaussian log stddev Tensor.\r\n    :return: a tensor like x of log probabilities (in nats).\r\n    \"\"\"\r\n    assert x.shape == means.shape == log_scales.shape\r\n    centered_x = x - means\r\n    inv_stdv = th.exp(-log_scales)\r\n    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)\r\n    cdf_plus = approx_standard_normal_cdf(plus_in)\r\n    min_in = inv_stdv * (centered_x - 1.0 / 255.0)\r\n    cdf_min = approx_standard_normal_cdf(min_in)\r\n    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))\r\n    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))\r\n    cdf_delta = cdf_plus - cdf_min\r\n    log_probs = th.where(\r\n        x < -0.999,\r\n        log_cdf_plus,\r\n        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),\r\n    )\r\n    assert log_probs.shape == x.shape\r\n    return log_probs\r\n"
  },
  {
    "path": "diffusion/gaussian_diffusion.py",
    "content": "# Modified from OpenAI's diffusion repos\r\n#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py\r\n#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion\r\n#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py\r\n\r\n\r\nimport math\r\n\r\nimport numpy as np\r\nimport torch as th\r\nimport enum\r\n\r\nfrom .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl\r\n\r\n\r\ndef mean_flat(tensor):\r\n    \"\"\"\r\n    Take the mean over all non-batch dimensions.\r\n    \"\"\"\r\n    return tensor.mean(dim=list(range(1, len(tensor.shape))))\r\n\r\n\r\nclass ModelMeanType(enum.Enum):\r\n    \"\"\"\r\n    Which type of output the model predicts.\r\n    \"\"\"\r\n\r\n    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}\r\n    START_X = enum.auto()  # the model predicts x_0\r\n    EPSILON = enum.auto()  # the model predicts epsilon\r\n\r\n\r\nclass ModelVarType(enum.Enum):\r\n    \"\"\"\r\n    What is used as the model's output variance.\r\n    The LEARNED_RANGE option has been added to allow the model to predict\r\n    values between FIXED_SMALL and FIXED_LARGE, making its job easier.\r\n    \"\"\"\r\n\r\n    LEARNED = enum.auto()\r\n    FIXED_SMALL = enum.auto()\r\n    FIXED_LARGE = enum.auto()\r\n    LEARNED_RANGE = enum.auto()\r\n\r\n\r\nclass LossType(enum.Enum):\r\n    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)\r\n    RESCALED_MSE = (\r\n        enum.auto()\r\n    )  # use raw MSE loss (with RESCALED_KL when learning variances)\r\n    KL = enum.auto()  # use the variational lower-bound\r\n    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB\r\n\r\n    def is_vb(self):\r\n        return self == LossType.KL or self == LossType.RESCALED_KL\r\n\r\n\r\ndef _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):\r\n    betas = beta_end * 
np.ones(num_diffusion_timesteps, dtype=np.float64)\r\n    warmup_time = int(num_diffusion_timesteps * warmup_frac)\r\n    betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)\r\n    return betas\r\n\r\n\r\ndef get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):\r\n    \"\"\"\r\n    This is the deprecated API for creating beta schedules.\r\n    See get_named_beta_schedule() for the new library of schedules.\r\n    \"\"\"\r\n    if beta_schedule == \"quad\":\r\n        betas = (\r\n            np.linspace(\r\n                beta_start ** 0.5,\r\n                beta_end ** 0.5,\r\n                num_diffusion_timesteps,\r\n                dtype=np.float64,\r\n            )\r\n            ** 2\r\n        )\r\n    elif beta_schedule == \"linear\":\r\n        betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)\r\n    elif beta_schedule == \"warmup10\":\r\n        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)\r\n    elif beta_schedule == \"warmup50\":\r\n        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)\r\n    elif beta_schedule == \"const\":\r\n        betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)\r\n    elif beta_schedule == \"jsd\":  # 1/T, 1/(T-1), 1/(T-2), ..., 1\r\n        betas = 1.0 / np.linspace(\r\n            num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64\r\n        )\r\n    else:\r\n        raise NotImplementedError(beta_schedule)\r\n    assert betas.shape == (num_diffusion_timesteps,)\r\n    return betas\r\n\r\n\r\ndef get_named_beta_schedule(schedule_name, num_diffusion_timesteps):\r\n    \"\"\"\r\n    Get a pre-defined beta schedule for the given name.\r\n    The beta schedule library consists of beta schedules which remain similar\r\n    in the limit of num_diffusion_timesteps.\r\n    Beta schedules may be added, but should not be removed or changed 
once\r\n    they are committed to maintain backwards compatibility.\r\n    \"\"\"\r\n    if schedule_name == \"linear\":\r\n        # Linear schedule from Ho et al, extended to work for any number of\r\n        # diffusion steps.\r\n        scale = 1000 / num_diffusion_timesteps\r\n        return get_beta_schedule(\r\n            \"linear\",\r\n            beta_start=scale * 0.0001,\r\n            beta_end=scale * 0.02,\r\n            # diffuser stable diffusion\r\n            # beta_start=scale * 0.00085,\r\n            # beta_end=scale * 0.012,\r\n            num_diffusion_timesteps=num_diffusion_timesteps,\r\n        )\r\n    elif schedule_name == \"squaredcos_cap_v2\":\r\n        return betas_for_alpha_bar(\r\n            num_diffusion_timesteps,\r\n            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,\r\n        )\r\n    else:\r\n        raise NotImplementedError(f\"unknown beta schedule: {schedule_name}\")\r\n\r\n\r\ndef betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):\r\n    \"\"\"\r\n    Create a beta schedule that discretizes the given alpha_t_bar function,\r\n    which defines the cumulative product of (1-beta) over time from t = [0,1].\r\n    :param num_diffusion_timesteps: the number of betas to produce.\r\n    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and\r\n                      produces the cumulative product of (1-beta) up to that\r\n                      part of the diffusion process.\r\n    :param max_beta: the maximum beta to use; use values lower than 1 to\r\n                     prevent singularities.\r\n    \"\"\"\r\n    betas = []\r\n    for i in range(num_diffusion_timesteps):\r\n        t1 = i / num_diffusion_timesteps\r\n        t2 = (i + 1) / num_diffusion_timesteps\r\n        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))\r\n    return np.array(betas)\r\n\r\n\r\nclass GaussianDiffusion:\r\n    \"\"\"\r\n    Utilities for training and sampling diffusion 
models.\r\n    Original ported from this codebase:\r\n    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42\r\n    :param betas: a 1-D numpy array of betas for each diffusion timestep,\r\n                  starting at T and going to 1.\r\n    \"\"\"\r\n\r\n    def __init__(\r\n        self,\r\n        *,\r\n        betas,\r\n        model_mean_type,\r\n        model_var_type,\r\n        loss_type\r\n    ):\r\n\r\n        self.model_mean_type = model_mean_type\r\n        self.model_var_type = model_var_type\r\n        self.loss_type = loss_type\r\n\r\n        # Use float64 for accuracy.\r\n        betas = np.array(betas, dtype=np.float64)\r\n        self.betas = betas\r\n        assert len(betas.shape) == 1, \"betas must be 1-D\"\r\n        assert (betas > 0).all() and (betas <= 1).all()\r\n\r\n        self.num_timesteps = int(betas.shape[0])\r\n\r\n        alphas = 1.0 - betas\r\n        self.alphas_cumprod = np.cumprod(alphas, axis=0)\r\n        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])\r\n        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)\r\n        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)\r\n\r\n        # calculations for diffusion q(x_t | x_{t-1}) and others\r\n        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)\r\n        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)\r\n        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)\r\n        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)\r\n        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)\r\n\r\n        # calculations for posterior q(x_{t-1} | x_t, x_0)\r\n        self.posterior_variance = (\r\n            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)\r\n        )\r\n        # below: log calculation clipped because the posterior 
variance is 0 at the beginning of the diffusion chain\r\n        self.posterior_log_variance_clipped = np.log(\r\n            np.append(self.posterior_variance[1], self.posterior_variance[1:])\r\n        ) if len(self.posterior_variance) > 1 else np.array([])\r\n\r\n        self.posterior_mean_coef1 = (\r\n            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)\r\n        )\r\n        self.posterior_mean_coef2 = (\r\n            (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)\r\n        )\r\n\r\n    def q_mean_variance(self, x_start, t):\r\n        \"\"\"\r\n        Get the distribution q(x_t | x_0).\r\n        :param x_start: the [N x C x ...] tensor of noiseless inputs.\r\n        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.\r\n        :return: A tuple (mean, variance, log_variance), all of x_start's shape.\r\n        \"\"\"\r\n        mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start\r\n        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)\r\n        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)\r\n        return mean, variance, log_variance\r\n\r\n    def q_sample(self, x_start, t, noise=None):\r\n        \"\"\"\r\n        Diffuse the data for a given number of diffusion steps.\r\n        In other words, sample from q(x_t | x_0).\r\n        :param x_start: the initial data batch.\r\n        :param t: the number of diffusion steps (minus 1). 
Here, 0 means one step.\r\n        :param noise: if specified, the split-out normal noise.\r\n        :return: A noisy version of x_start.\r\n        \"\"\"\r\n        if noise is None:\r\n            noise = th.randn_like(x_start)\r\n        assert noise.shape == x_start.shape\r\n        return (\r\n            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start\r\n            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise\r\n        )\r\n\r\n    def q_posterior_mean_variance(self, x_start, x_t, t):\r\n        \"\"\"\r\n        Compute the mean and variance of the diffusion posterior:\r\n            q(x_{t-1} | x_t, x_0)\r\n        \"\"\"\r\n        assert x_start.shape == x_t.shape\r\n        posterior_mean = (\r\n            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start\r\n            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t\r\n        )\r\n        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)\r\n        posterior_log_variance_clipped = _extract_into_tensor(\r\n            self.posterior_log_variance_clipped, t, x_t.shape\r\n        )\r\n        assert (\r\n            posterior_mean.shape[0]\r\n            == posterior_variance.shape[0]\r\n            == posterior_log_variance_clipped.shape[0]\r\n            == x_start.shape[0]\r\n        )\r\n        return posterior_mean, posterior_variance, posterior_log_variance_clipped\r\n\r\n    def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None,\r\n                        mask=None, x_start=None, use_concat=False):\r\n        \"\"\"\r\n        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of\r\n        the initial x, x_0.\r\n        :param model: the model, which takes a signal and a batch of timesteps\r\n                      as input.\r\n        :param x: the [N x C x ...] 
tensor at time t.\r\n        :param t: a 1-D Tensor of timesteps.\r\n        :param clip_denoised: if True, clip the denoised signal into [-1, 1].\r\n        :param denoised_fn: if not None, a function which applies to the\r\n            x_start prediction before it is used to sample. Applies before\r\n            clip_denoised.\r\n        :param model_kwargs: if not None, a dict of extra keyword arguments to\r\n            pass to the model. This can be used for conditioning.\r\n        :return: a dict with the following keys:\r\n                 - 'mean': the model mean output.\r\n                 - 'variance': the model variance output.\r\n                 - 'log_variance': the log of 'variance'.\r\n                 - 'pred_xstart': the prediction for x_0.\r\n        \"\"\"\r\n        if model_kwargs is None:\r\n            model_kwargs = {}\r\n\r\n        B, F, C = x.shape[:3]\r\n        assert t.shape == (B,)\r\n        if use_concat:\r\n            model_output = model(th.concat([x, mask, x_start], dim=1), t, **model_kwargs)\r\n        else:\r\n            model_output = model(x, t, **model_kwargs)\r\n        try:\r\n            model_output = model_output.sample # for tav unet\r\n        except:\r\n            pass\r\n            # model_output = model(x, t, **model_kwargs)\r\n        if isinstance(model_output, tuple):\r\n            model_output, extra = model_output\r\n        else:\r\n            extra = None\r\n\r\n        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:\r\n            assert model_output.shape == (B, F, C * 2, *x.shape[3:])\r\n            model_output, model_var_values = th.split(model_output, C, dim=2)\r\n            min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)\r\n            max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)\r\n            # The model_var_values is [-1, 1] for [min_var, max_var].\r\n            frac = (model_var_values + 1) / 2\r\n            
model_log_variance = frac * max_log + (1 - frac) * min_log\r\n            model_variance = th.exp(model_log_variance)\r\n        else:\r\n            model_variance, model_log_variance = {\r\n                # for fixedlarge, we set the initial (log-)variance like so\r\n                # to get a better decoder log likelihood.\r\n                ModelVarType.FIXED_LARGE: (\r\n                    np.append(self.posterior_variance[1], self.betas[1:]),\r\n                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),\r\n                ),\r\n                ModelVarType.FIXED_SMALL: (\r\n                    self.posterior_variance,\r\n                    self.posterior_log_variance_clipped,\r\n                ),\r\n            }[self.model_var_type]\r\n            model_variance = _extract_into_tensor(model_variance, t, x.shape)\r\n            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)\r\n\r\n        def process_xstart(x):\r\n            if denoised_fn is not None:\r\n                x = denoised_fn(x)\r\n            if clip_denoised:\r\n                return x.clamp(-1, 1)\r\n            return x\r\n\r\n        if self.model_mean_type == ModelMeanType.START_X:\r\n            pred_xstart = process_xstart(model_output)\r\n        else:\r\n            pred_xstart = process_xstart(\r\n                self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)\r\n            )\r\n        model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)\r\n\r\n        assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape\r\n        return {\r\n            \"mean\": model_mean,\r\n            \"variance\": model_variance,\r\n            \"log_variance\": model_log_variance,\r\n            \"pred_xstart\": pred_xstart,\r\n            \"extra\": extra,\r\n        }\r\n\r\n    def _predict_xstart_from_eps(self, x_t, t, eps):\r\n        assert x_t.shape == eps.shape\r\n        
return (\r\n            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t\r\n            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps\r\n        )\r\n\r\n    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):\r\n        return (\r\n            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart\r\n        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)\r\n\r\n    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):\r\n        \"\"\"\r\n        Compute the mean for the previous step, given a function cond_fn that\r\n        computes the gradient of a conditional log probability with respect to\r\n        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to\r\n        condition on y.\r\n        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).\r\n        \"\"\"\r\n        gradient = cond_fn(x, t, **model_kwargs)\r\n        new_mean = p_mean_var[\"mean\"].float() + p_mean_var[\"variance\"] * gradient.float()\r\n        return new_mean\r\n\r\n    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):\r\n        \"\"\"\r\n        Compute what the p_mean_variance output would have been, should the\r\n        model's score function be conditioned by cond_fn.\r\n        See condition_mean() for details on cond_fn.\r\n        Unlike condition_mean(), this instead uses the conditioning strategy\r\n        from Song et al (2020).\r\n        \"\"\"\r\n        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)\r\n\r\n        eps = self._predict_eps_from_xstart(x, t, p_mean_var[\"pred_xstart\"])\r\n        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)\r\n\r\n        out = p_mean_var.copy()\r\n        out[\"pred_xstart\"] = self._predict_xstart_from_eps(x, t, eps)\r\n        out[\"mean\"], _, _ = 
self.q_posterior_mean_variance(x_start=out[\"pred_xstart\"], x_t=x, t=t)\r\n        return out\r\n\r\n    def p_sample(\r\n        self,\r\n        model,\r\n        x,\r\n        t,\r\n        clip_denoised=True,\r\n        denoised_fn=None,\r\n        cond_fn=None,\r\n        model_kwargs=None,\r\n        mask=None,\r\n        x_start=None,\r\n        use_concat=False\r\n    ):\r\n        \"\"\"\r\n        Sample x_{t-1} from the model at the given timestep.\r\n        :param model: the model to sample from.\r\n        :param x: the current tensor at x_{t-1}.\r\n        :param t: the value of t, starting at 0 for the first diffusion step.\r\n        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].\r\n        :param denoised_fn: if not None, a function which applies to the\r\n            x_start prediction before it is used to sample.\r\n        :param cond_fn: if not None, this is a gradient function that acts\r\n                        similarly to the model.\r\n        :param model_kwargs: if not None, a dict of extra keyword arguments to\r\n            pass to the model. 
This can be used for conditioning.\r\n        :return: a dict containing the following keys:\r\n                 - 'sample': a random sample from the model.\r\n                 - 'pred_xstart': a prediction of x_0.\r\n        \"\"\"\r\n        out = self.p_mean_variance(\r\n            model,\r\n            x,\r\n            t,\r\n            clip_denoised=clip_denoised,\r\n            denoised_fn=denoised_fn,\r\n            model_kwargs=model_kwargs,\r\n            mask=mask,\r\n            x_start=x_start,\r\n            use_concat=use_concat\r\n        )\r\n        noise = th.randn_like(x)\r\n        nonzero_mask = (\r\n            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))\r\n        )  # no noise when t == 0\r\n        if cond_fn is not None:\r\n            out[\"mean\"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)\r\n        sample = out[\"mean\"] + nonzero_mask * th.exp(0.5 * out[\"log_variance\"]) * noise\r\n        return {\"sample\": sample, \"pred_xstart\": out[\"pred_xstart\"]}\r\n\r\n    def p_sample_loop(\r\n        self,\r\n        model,\r\n        shape,\r\n        noise=None,\r\n        clip_denoised=True,\r\n        denoised_fn=None,\r\n        cond_fn=None,\r\n        model_kwargs=None,\r\n        device=None,\r\n        progress=False,\r\n        mask=None,\r\n        x_start=None,\r\n        use_concat=False,\r\n    ):\r\n        \"\"\"\r\n        Generate samples from the model.\r\n        :param model: the model module.\r\n        :param shape: the shape of the samples, (N, C, H, W).\r\n        :param noise: if specified, the noise from the encoder to sample.\r\n                      Should be of the same shape as `shape`.\r\n        :param clip_denoised: if True, clip x_start predictions to [-1, 1].\r\n        :param denoised_fn: if not None, a function which applies to the\r\n            x_start prediction before it is used to sample.\r\n        :param cond_fn: if not None, this is a gradient function 
that acts\r\n                        similarly to the model.\r\n        :param model_kwargs: if not None, a dict of extra keyword arguments to\r\n            pass to the model. This can be used for conditioning.\r\n        :param device: if specified, the device to create the samples on.\r\n                       If not specified, use a model parameter's device.\r\n        :param progress: if True, show a tqdm progress bar.\r\n        :return: a non-differentiable batch of samples.\r\n        \"\"\"\r\n        final = None\r\n        for sample in self.p_sample_loop_progressive(\r\n            model,\r\n            shape,\r\n            noise=noise,\r\n            clip_denoised=clip_denoised,\r\n            denoised_fn=denoised_fn,\r\n            cond_fn=cond_fn,\r\n            model_kwargs=model_kwargs,\r\n            device=device,\r\n            progress=progress,\r\n            mask=mask,\r\n            x_start=x_start,\r\n            use_concat=use_concat\r\n        ):\r\n            final = sample\r\n        return final[\"sample\"]\r\n\r\n    def p_sample_loop_progressive(\r\n        self,\r\n        model,\r\n        shape,\r\n        noise=None,\r\n        clip_denoised=True,\r\n        denoised_fn=None,\r\n        cond_fn=None,\r\n        model_kwargs=None,\r\n        device=None,\r\n        progress=False,\r\n        mask=None,\r\n        x_start=None,\r\n        use_concat=False\r\n    ):\r\n        \"\"\"\r\n        Generate samples from the model and yield intermediate samples from\r\n        each timestep of diffusion.\r\n        Arguments are the same as p_sample_loop().\r\n        Returns a generator over dicts, where each dict is the return value of\r\n        p_sample().\r\n        \"\"\"\r\n        if device is None:\r\n            device = next(model.parameters()).device\r\n        assert isinstance(shape, (tuple, list))\r\n        if noise is not None:\r\n            img = noise\r\n        else:\r\n            img = th.randn(*shape, 
device=device)\r\n        indices = list(range(self.num_timesteps))[::-1]\r\n\r\n        if progress:\r\n            # Lazy import so that we don't depend on tqdm.\r\n            from tqdm.auto import tqdm\r\n\r\n            indices = tqdm(indices)\r\n\r\n        for i in indices:\r\n            t = th.tensor([i] * shape[0], device=device)\r\n            with th.no_grad():\r\n                out = self.p_sample(\r\n                    model,\r\n                    img,\r\n                    t,\r\n                    clip_denoised=clip_denoised,\r\n                    denoised_fn=denoised_fn,\r\n                    cond_fn=cond_fn,\r\n                    model_kwargs=model_kwargs,\r\n                    mask=mask,\r\n                    x_start=x_start,\r\n                    use_concat=use_concat\r\n                )\r\n                yield out\r\n                img = out[\"sample\"]\r\n\r\n    def ddim_sample(\r\n        self,\r\n        model,\r\n        x,\r\n        t,\r\n        clip_denoised=True,\r\n        denoised_fn=None,\r\n        cond_fn=None,\r\n        model_kwargs=None,\r\n        eta=0.0,\r\n        mask=None,\r\n        x_start=None,\r\n        use_concat=False\r\n    ):\r\n        \"\"\"\r\n        Sample x_{t-1} from the model using DDIM.\r\n        Same usage as p_sample().\r\n        \"\"\"\r\n        out = self.p_mean_variance(\r\n            model,\r\n            x,\r\n            t,\r\n            clip_denoised=clip_denoised,\r\n            denoised_fn=denoised_fn,\r\n            model_kwargs=model_kwargs,\r\n            mask=mask,\r\n            x_start=x_start,\r\n            use_concat=use_concat\r\n        )\r\n        if cond_fn is not None:\r\n            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)\r\n\r\n        # Usually our model outputs epsilon, but we re-derive it\r\n        # in case we used x_start or x_prev prediction.\r\n        eps = self._predict_eps_from_xstart(x, t, 
out[\"pred_xstart\"])\r\n\r\n        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)\r\n        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)\r\n        sigma = (\r\n            eta\r\n            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))\r\n            * th.sqrt(1 - alpha_bar / alpha_bar_prev)\r\n        )\r\n        # Equation 12.\r\n        noise = th.randn_like(x)\r\n        mean_pred = (\r\n            out[\"pred_xstart\"] * th.sqrt(alpha_bar_prev)\r\n            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps\r\n        )\r\n        nonzero_mask = (\r\n            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))\r\n        )  # no noise when t == 0\r\n        sample = mean_pred + nonzero_mask * sigma * noise\r\n        return {\"sample\": sample, \"pred_xstart\": out[\"pred_xstart\"]}\r\n\r\n    def ddim_reverse_sample(\r\n        self,\r\n        model,\r\n        x,\r\n        t,\r\n        clip_denoised=True,\r\n        denoised_fn=None,\r\n        cond_fn=None,\r\n        model_kwargs=None,\r\n        eta=0.0,\r\n    ):\r\n        \"\"\"\r\n        Sample x_{t+1} from the model using DDIM reverse ODE.\r\n        \"\"\"\r\n        assert eta == 0.0, \"Reverse ODE only for deterministic path\"\r\n        out = self.p_mean_variance(\r\n            model,\r\n            x,\r\n            t,\r\n            clip_denoised=clip_denoised,\r\n            denoised_fn=denoised_fn,\r\n            model_kwargs=model_kwargs,\r\n        )\r\n        if cond_fn is not None:\r\n            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)\r\n        # Usually our model outputs epsilon, but we re-derive it\r\n        # in case we used x_start or x_prev prediction.\r\n        eps = (\r\n            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x\r\n            - out[\"pred_xstart\"]\r\n        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)\r\n   
     alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)\r\n\r\n        # Equation 12. reversed\r\n        mean_pred = out[\"pred_xstart\"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps\r\n\r\n        return {\"sample\": mean_pred, \"pred_xstart\": out[\"pred_xstart\"]}\r\n\r\n    def ddim_sample_loop(\r\n        self,\r\n        model,\r\n        shape,\r\n        noise=None,\r\n        clip_denoised=True,\r\n        denoised_fn=None,\r\n        cond_fn=None,\r\n        model_kwargs=None,\r\n        device=None,\r\n        progress=False,\r\n        eta=0.0,\r\n        mask=None,\r\n        x_start=None,\r\n        use_concat=False\r\n    ):\r\n        \"\"\"\r\n        Generate samples from the model using DDIM.\r\n        Same usage as p_sample_loop().\r\n        \"\"\"\r\n        final = None\r\n        for sample in self.ddim_sample_loop_progressive(\r\n            model,\r\n            shape,\r\n            noise=noise,\r\n            clip_denoised=clip_denoised,\r\n            denoised_fn=denoised_fn,\r\n            cond_fn=cond_fn,\r\n            model_kwargs=model_kwargs,\r\n            device=device,\r\n            progress=progress,\r\n            eta=eta,\r\n            mask=mask,\r\n            x_start=x_start,\r\n            use_concat=use_concat\r\n        ):\r\n            final = sample\r\n        return final[\"sample\"]\r\n\r\n    def ddim_sample_loop_progressive(\r\n        self,\r\n        model,\r\n        shape,\r\n        noise=None,\r\n        clip_denoised=True,\r\n        denoised_fn=None,\r\n        cond_fn=None,\r\n        model_kwargs=None,\r\n        device=None,\r\n        progress=False,\r\n        eta=0.0,\r\n        mask=None,\r\n        x_start=None,\r\n        use_concat=False\r\n    ):\r\n        \"\"\"\r\n        Use DDIM to sample from the model and yield intermediate samples from\r\n        each timestep of DDIM.\r\n        Same usage as p_sample_loop_progressive().\r\n        
\"\"\"\r\n        if device is None:\r\n            device = next(model.parameters()).device\r\n        assert isinstance(shape, (tuple, list))\r\n        if noise is not None:\r\n            img = noise\r\n        else:\r\n            img = th.randn(*shape, device=device)\r\n        indices = list(range(self.num_timesteps))[::-1]\r\n\r\n        if progress:\r\n            # Lazy import so that we don't depend on tqdm.\r\n            from tqdm.auto import tqdm\r\n\r\n            indices = tqdm(indices)\r\n\r\n        for i in indices:\r\n            t = th.tensor([i] * shape[0], device=device)\r\n            with th.no_grad():\r\n                out = self.ddim_sample(\r\n                    model,\r\n                    img,\r\n                    t,\r\n                    clip_denoised=clip_denoised,\r\n                    denoised_fn=denoised_fn,\r\n                    cond_fn=cond_fn,\r\n                    model_kwargs=model_kwargs,\r\n                    eta=eta,\r\n                    mask=mask,\r\n                    x_start=x_start,\r\n                    use_concat=use_concat\r\n                )\r\n                yield out\r\n                img = out[\"sample\"]\r\n\r\n    def _vb_terms_bpd(\r\n            self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None\r\n    ):\r\n        \"\"\"\r\n        Get a term for the variational lower-bound.\r\n        The resulting units are bits (rather than nats, as one might expect).\r\n        This allows for comparison to other papers.\r\n        :return: a dict with the following keys:\r\n                 - 'output': a shape [N] tensor of NLLs or KLs.\r\n                 - 'pred_xstart': the x_0 predictions.\r\n        \"\"\"\r\n        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(\r\n            x_start=x_start, x_t=x_t, t=t\r\n        )\r\n        out = self.p_mean_variance(\r\n            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs\r\n        
)\r\n        kl = normal_kl(\r\n            true_mean, true_log_variance_clipped, out[\"mean\"], out[\"log_variance\"]\r\n        )\r\n        kl = mean_flat(kl) / np.log(2.0)\r\n\r\n        decoder_nll = -discretized_gaussian_log_likelihood(\r\n            x_start, means=out[\"mean\"], log_scales=0.5 * out[\"log_variance\"]\r\n        )\r\n        assert decoder_nll.shape == x_start.shape\r\n        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)\r\n\r\n        # At the first timestep return the decoder NLL,\r\n        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))\r\n        output = th.where((t == 0), decoder_nll, kl)\r\n        return {\"output\": output, \"pred_xstart\": out[\"pred_xstart\"]}\r\n\r\n    def training_losses(self, model, x_start, t, model_kwargs=None, noise=None, use_mask=False):\r\n        \"\"\"\r\n        Compute training losses for a single timestep.\r\n        :param model: the model to evaluate loss on.\r\n        :param x_start: the [N x C x ...] tensor of inputs.\r\n        :param t: a batch of timestep indices.\r\n        :param model_kwargs: if not None, a dict of extra keyword arguments to\r\n            pass to the model. 
This can be used for conditioning.\r\n        :param noise: if specified, the specific Gaussian noise to try to remove.\r\n        :return: a dict with the key \"loss\" containing a tensor of shape [N].\r\n                 Some mean or variance settings may also have other keys.\r\n        \"\"\"\r\n        if model_kwargs is None:\r\n            model_kwargs = {}\r\n        if noise is None:\r\n            noise = th.randn_like(x_start)\r\n        x_t = self.q_sample(x_start, t, noise=noise)\r\n        if use_mask:\r\n            x_t = th.cat([x_t[:, :4], x_start[:, 4:]], dim=1)\r\n        terms = {}\r\n\r\n        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:\r\n            terms[\"loss\"] = self._vb_terms_bpd(\r\n                model=model,\r\n                x_start=x_start,\r\n                x_t=x_t,\r\n                t=t,\r\n                clip_denoised=False,\r\n                model_kwargs=model_kwargs,\r\n            )[\"output\"]\r\n            if self.loss_type == LossType.RESCALED_KL:\r\n                terms[\"loss\"] *= self.num_timesteps\r\n        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:\r\n            model_output = model(x_t, t, **model_kwargs)\r\n            try:\r\n                # model_output = model(x_t, t, **model_kwargs).sample\r\n                model_output = model_output.sample # for tav unet\r\n            except:\r\n                pass\r\n                # model_output = model(x_t, t, **model_kwargs)\r\n\r\n            if self.model_var_type in [\r\n                ModelVarType.LEARNED,\r\n                ModelVarType.LEARNED_RANGE,\r\n            ]:\r\n                B, F, C = x_t.shape[:3]\r\n                assert model_output.shape == (B, F, C * 2, *x_t.shape[3:])\r\n                model_output, model_var_values = th.split(model_output, C, dim=2)\r\n                # Learn the variance using the variational bound, but don't let\r\n                # it 
affect our mean prediction.\r\n                frozen_out = th.cat([model_output.detach(), model_var_values], dim=2)\r\n                terms[\"vb\"] = self._vb_terms_bpd(\r\n                    model=lambda *args, r=frozen_out: r,\r\n                    x_start=x_start,\r\n                    x_t=x_t,\r\n                    t=t,\r\n                    clip_denoised=False,\r\n                )[\"output\"]\r\n                if self.loss_type == LossType.RESCALED_MSE:\r\n                    # Divide by 1000 for equivalence with initial implementation.\r\n                    # Without a factor of 1/1000, the VB term hurts the MSE term.\r\n                    terms[\"vb\"] *= self.num_timesteps / 1000.0\r\n\r\n            target = {\r\n                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(\r\n                    x_start=x_start, x_t=x_t, t=t\r\n                )[0],\r\n                ModelMeanType.START_X: x_start,\r\n                ModelMeanType.EPSILON: noise,\r\n            }[self.model_mean_type]\r\n            # assert model_output.shape == target.shape == x_start.shape\r\n            if use_mask:\r\n                terms[\"mse\"] = mean_flat((target[:,:4] - model_output) ** 2)\r\n            else:\r\n                terms[\"mse\"] = mean_flat((target - model_output) ** 2)\r\n            if \"vb\" in terms:\r\n                terms[\"loss\"] = terms[\"mse\"] + terms[\"vb\"]\r\n            else:\r\n                terms[\"loss\"] = terms[\"mse\"]\r\n        else:\r\n            raise NotImplementedError(self.loss_type)\r\n\r\n        return terms\r\n\r\n    def _prior_bpd(self, x_start):\r\n        \"\"\"\r\n        Get the prior KL term for the variational lower-bound, measured in\r\n        bits-per-dim.\r\n        This term can't be optimized, as it only depends on the encoder.\r\n        :param x_start: the [N x C x ...] 
tensor of inputs.\r\n        :return: a batch of [N] KL values (in bits), one per batch element.\r\n        \"\"\"\r\n        batch_size = x_start.shape[0]\r\n        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)\r\n        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)\r\n        kl_prior = normal_kl(\r\n            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0\r\n        )\r\n        return mean_flat(kl_prior) / np.log(2.0)\r\n\r\n    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):\r\n        \"\"\"\r\n        Compute the entire variational lower-bound, measured in bits-per-dim,\r\n        as well as other related quantities.\r\n        :param model: the model to evaluate loss on.\r\n        :param x_start: the [N x C x ...] tensor of inputs.\r\n        :param clip_denoised: if True, clip denoised samples.\r\n        :param model_kwargs: if not None, a dict of extra keyword arguments to\r\n            pass to the model. 
This can be used for conditioning.\r\n        :return: a dict containing the following keys:\r\n                 - total_bpd: the total variational lower-bound, per batch element.\r\n                 - prior_bpd: the prior term in the lower-bound.\r\n                 - vb: an [N x T] tensor of terms in the lower-bound.\r\n                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.\r\n                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.\r\n        \"\"\"\r\n        device = x_start.device\r\n        batch_size = x_start.shape[0]\r\n\r\n        vb = []\r\n        xstart_mse = []\r\n        mse = []\r\n        for t in list(range(self.num_timesteps))[::-1]:\r\n            t_batch = th.tensor([t] * batch_size, device=device)\r\n            noise = th.randn_like(x_start)\r\n            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)\r\n            # Calculate VLB term at the current timestep\r\n            with th.no_grad():\r\n                out = self._vb_terms_bpd(\r\n                    model,\r\n                    x_start=x_start,\r\n                    x_t=x_t,\r\n                    t=t_batch,\r\n                    clip_denoised=clip_denoised,\r\n                    model_kwargs=model_kwargs,\r\n                )\r\n            vb.append(out[\"output\"])\r\n            xstart_mse.append(mean_flat((out[\"pred_xstart\"] - x_start) ** 2))\r\n            eps = self._predict_eps_from_xstart(x_t, t_batch, out[\"pred_xstart\"])\r\n            mse.append(mean_flat((eps - noise) ** 2))\r\n\r\n        vb = th.stack(vb, dim=1)\r\n        xstart_mse = th.stack(xstart_mse, dim=1)\r\n        mse = th.stack(mse, dim=1)\r\n\r\n        prior_bpd = self._prior_bpd(x_start)\r\n        total_bpd = vb.sum(dim=1) + prior_bpd\r\n        return {\r\n            \"total_bpd\": total_bpd,\r\n            \"prior_bpd\": prior_bpd,\r\n            \"vb\": vb,\r\n            \"xstart_mse\": xstart_mse,\r\n            \"mse\": mse,\r\n 
       }\r\n\r\n\r\ndef _extract_into_tensor(arr, timesteps, broadcast_shape):\r\n    \"\"\"\r\n    Extract values from a 1-D numpy array for a batch of indices.\r\n    :param arr: the 1-D numpy array.\r\n    :param timesteps: a tensor of indices into the array to extract.\r\n    :param broadcast_shape: a larger shape of K dimensions with the batch\r\n                            dimension equal to the length of timesteps.\r\n    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.\r\n    \"\"\"\r\n    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()\r\n    while len(res.shape) < len(broadcast_shape):\r\n        res = res[..., None]\r\n    return res + th.zeros(broadcast_shape, device=timesteps.device)\r\n"
  },
  {
    "path": "diffusion/respace.py",
    "content": "# Modified from OpenAI's diffusion repos\r\n#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py\r\n#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion\r\n#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py\r\nimport torch\r\nimport numpy as np\r\nimport torch as th\r\n\r\nfrom .gaussian_diffusion import GaussianDiffusion\r\n\r\n\r\ndef space_timesteps(num_timesteps, section_counts):\r\n    \"\"\"\r\n    Create a list of timesteps to use from an original diffusion process,\r\n    given the number of timesteps we want to take from equally-sized portions\r\n    of the original process.\r\n    For example, if there's 300 timesteps and the section counts are [10,15,20]\r\n    then the first 100 timesteps are strided to be 10 timesteps, the second 100\r\n    are strided to be 15 timesteps, and the final 100 are strided to be 20.\r\n    If the stride is a string starting with \"ddim\", then the fixed striding\r\n    from the DDIM paper is used, and only one section is allowed.\r\n    :param num_timesteps: the number of diffusion steps in the original\r\n                          process to divide up.\r\n    :param section_counts: either a list of numbers, or a string containing\r\n                           comma-separated numbers, indicating the step count\r\n                           per section. 
As a special case, use \"ddimN\" where N\r\n                           is a number of steps to use the striding from the\r\n                           DDIM paper.\r\n    :return: a set of diffusion steps from the original process to use.\r\n    \"\"\"\r\n    if isinstance(section_counts, str):\r\n        if section_counts.startswith(\"ddim\"):\r\n            desired_count = int(section_counts[len(\"ddim\") :])\r\n            for i in range(1, num_timesteps):\r\n                if len(range(0, num_timesteps, i)) == desired_count:\r\n                    return set(range(0, num_timesteps, i))\r\n            raise ValueError(\r\n                f\"cannot create exactly {num_timesteps} steps with an integer stride\"\r\n            )\r\n        section_counts = [int(x) for x in section_counts.split(\",\")]\r\n    size_per = num_timesteps // len(section_counts)\r\n    extra = num_timesteps % len(section_counts)\r\n    start_idx = 0\r\n    all_steps = []\r\n    for i, section_count in enumerate(section_counts):\r\n        size = size_per + (1 if i < extra else 0)\r\n        if size < section_count:\r\n            raise ValueError(\r\n                f\"cannot divide section of {size} steps into {section_count}\"\r\n            )\r\n        if section_count <= 1:\r\n            frac_stride = 1\r\n        else:\r\n            frac_stride = (size - 1) / (section_count - 1)\r\n        cur_idx = 0.0\r\n        taken_steps = []\r\n        for _ in range(section_count):\r\n            taken_steps.append(start_idx + round(cur_idx))\r\n            cur_idx += frac_stride\r\n        all_steps += taken_steps\r\n        start_idx += size\r\n    return set(all_steps)\r\n\r\n\r\nclass SpacedDiffusion(GaussianDiffusion):\r\n    \"\"\"\r\n    A diffusion process which can skip steps in a base diffusion process.\r\n    :param use_timesteps: a collection (sequence or set) of timesteps from the\r\n                          original diffusion process to retain.\r\n    :param kwargs: the kwargs 
to create the base diffusion process.\r\n    \"\"\"\r\n\r\n    def __init__(self, use_timesteps, **kwargs):\r\n        self.use_timesteps = set(use_timesteps)\r\n        self.timestep_map = []\r\n        self.original_num_steps = len(kwargs[\"betas\"])\r\n\r\n        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa\r\n        last_alpha_cumprod = 1.0\r\n        new_betas = []\r\n        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):\r\n            if i in self.use_timesteps:\r\n                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)\r\n                last_alpha_cumprod = alpha_cumprod\r\n                self.timestep_map.append(i)\r\n        kwargs[\"betas\"] = np.array(new_betas)\r\n        super().__init__(**kwargs)\r\n\r\n    def p_mean_variance(\r\n        self, model, *args, **kwargs\r\n    ):  # pylint: disable=signature-differs\r\n        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)\r\n\r\n    # @torch.compile\r\n    def training_losses(\r\n        self, model, *args, **kwargs\r\n    ):  # pylint: disable=signature-differs\r\n        return super().training_losses(self._wrap_model(model), *args, **kwargs)\r\n\r\n    def condition_mean(self, cond_fn, *args, **kwargs):\r\n        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)\r\n\r\n    def condition_score(self, cond_fn, *args, **kwargs):\r\n        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)\r\n\r\n    def _wrap_model(self, model):\r\n        if isinstance(model, _WrappedModel):\r\n            return model\r\n        return _WrappedModel(\r\n            model, self.timestep_map, self.original_num_steps\r\n        )\r\n\r\n    def _scale_timesteps(self, t):\r\n        # Scaling is done by the wrapped model.\r\n        return t\r\n\r\n\r\nclass _WrappedModel:\r\n    def __init__(self, model, timestep_map, original_num_steps):\r\n        self.model = model\r\n     
   self.timestep_map = timestep_map\r\n        # self.rescale_timesteps = rescale_timesteps\r\n        self.original_num_steps = original_num_steps\r\n\r\n    def __call__(self, x, ts, **kwargs):\r\n        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)\r\n        new_ts = map_tensor[ts]\r\n        # if self.rescale_timesteps:\r\n        #     new_ts = new_ts.float() * (1000.0 / self.original_num_steps)\r\n        return self.model(x, new_ts, **kwargs)\r\n"
  },
  {
    "path": "diffusion/timestep_sampler.py",
    "content": "# Modified from OpenAI's diffusion repos\r\n#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py\r\n#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion\r\n#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py\r\n\r\nfrom abc import ABC, abstractmethod\r\n\r\nimport numpy as np\r\nimport torch as th\r\nimport torch.distributed as dist\r\n\r\n\r\ndef create_named_schedule_sampler(name, diffusion):\r\n    \"\"\"\r\n    Create a ScheduleSampler from a library of pre-defined samplers.\r\n    :param name: the name of the sampler.\r\n    :param diffusion: the diffusion object to sample for.\r\n    \"\"\"\r\n    if name == \"uniform\":\r\n        return UniformSampler(diffusion)\r\n    elif name == \"loss-second-moment\":\r\n        return LossSecondMomentResampler(diffusion)\r\n    else:\r\n        raise NotImplementedError(f\"unknown schedule sampler: {name}\")\r\n\r\n\r\nclass ScheduleSampler(ABC):\r\n    \"\"\"\r\n    A distribution over timesteps in the diffusion process, intended to reduce\r\n    variance of the objective.\r\n    By default, samplers perform unbiased importance sampling, in which the\r\n    objective's mean is unchanged.\r\n    However, subclasses may override sample() to change how the resampled\r\n    terms are reweighted, allowing for actual changes in the objective.\r\n    \"\"\"\r\n\r\n    @abstractmethod\r\n    def weights(self):\r\n        \"\"\"\r\n        Get a numpy array of weights, one per diffusion step.\r\n        The weights needn't be normalized, but must be positive.\r\n        \"\"\"\r\n\r\n    def sample(self, batch_size, device):\r\n        \"\"\"\r\n        Importance-sample timesteps for a batch.\r\n        :param batch_size: the number of timesteps.\r\n        :param device: the torch device to save to.\r\n        :return: a tuple (timesteps, weights):\r\n                 - 
timesteps: a tensor of timestep indices.\r\n                 - weights: a tensor of weights to scale the resulting losses.\r\n        \"\"\"\r\n        w = self.weights()\r\n        p = w / np.sum(w)\r\n        indices_np = np.random.choice(len(p), size=(batch_size,), p=p)\r\n        indices = th.from_numpy(indices_np).long().to(device)\r\n        weights_np = 1 / (len(p) * p[indices_np])\r\n        weights = th.from_numpy(weights_np).float().to(device)\r\n        return indices, weights\r\n\r\n\r\nclass UniformSampler(ScheduleSampler):\r\n    def __init__(self, diffusion):\r\n        self.diffusion = diffusion\r\n        self._weights = np.ones([diffusion.num_timesteps])\r\n\r\n    def weights(self):\r\n        return self._weights\r\n\r\n\r\nclass LossAwareSampler(ScheduleSampler):\r\n    def update_with_local_losses(self, local_ts, local_losses):\r\n        \"\"\"\r\n        Update the reweighting using losses from a model.\r\n        Call this method from each rank with a batch of timesteps and the\r\n        corresponding losses for each of those timesteps.\r\n        This method will perform synchronization to make sure all of the ranks\r\n        maintain the exact same reweighting.\r\n        :param local_ts: an integer Tensor of timesteps.\r\n        :param local_losses: a 1D Tensor of losses.\r\n        \"\"\"\r\n        batch_sizes = [\r\n            th.tensor([0], dtype=th.int32, device=local_ts.device)\r\n            for _ in range(dist.get_world_size())\r\n        ]\r\n        dist.all_gather(\r\n            batch_sizes,\r\n            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),\r\n        )\r\n\r\n        # Pad all_gather batches to be the maximum batch size.\r\n        batch_sizes = [x.item() for x in batch_sizes]\r\n        max_bs = max(batch_sizes)\r\n\r\n        timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]\r\n        loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]\r\n    
    dist.all_gather(timestep_batches, local_ts)\r\n        dist.all_gather(loss_batches, local_losses)\r\n        timesteps = [\r\n            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]\r\n        ]\r\n        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]\r\n        self.update_with_all_losses(timesteps, losses)\r\n\r\n    @abstractmethod\r\n    def update_with_all_losses(self, ts, losses):\r\n        \"\"\"\r\n        Update the reweighting using losses from a model.\r\n        Sub-classes should override this method to update the reweighting\r\n        using losses from the model.\r\n        This method directly updates the reweighting without synchronizing\r\n        between workers. It is called by update_with_local_losses from all\r\n        ranks with identical arguments. Thus, it should have deterministic\r\n        behavior to maintain state across workers.\r\n        :param ts: a list of int timesteps.\r\n        :param losses: a list of float losses, one per timestep.\r\n        \"\"\"\r\n\r\n\r\nclass LossSecondMomentResampler(LossAwareSampler):\r\n    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):\r\n        self.diffusion = diffusion\r\n        self.history_per_term = history_per_term\r\n        self.uniform_prob = uniform_prob\r\n        self._loss_history = np.zeros(\r\n            [diffusion.num_timesteps, history_per_term], dtype=np.float64\r\n        )\r\n        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int)\r\n\r\n    def weights(self):\r\n        if not self._warmed_up():\r\n            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)\r\n        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))\r\n        weights /= np.sum(weights)\r\n        weights *= 1 - self.uniform_prob\r\n        weights += self.uniform_prob / len(weights)\r\n        return weights\r\n\r\n    def update_with_all_losses(self, ts, 
losses):\r\n        for t, loss in zip(ts, losses):\r\n            if self._loss_counts[t] == self.history_per_term:\r\n                # Shift out the oldest loss term.\r\n                self._loss_history[t, :-1] = self._loss_history[t, 1:]\r\n                self._loss_history[t, -1] = loss\r\n            else:\r\n                self._loss_history[t, self._loss_counts[t]] = loss\r\n                self._loss_counts[t] += 1\r\n\r\n    def _warmed_up(self):\r\n        return (self._loss_counts == self.history_per_term).all()\r\n"
  },
  {
    "path": "models/__init__.py",
    "content": "import os\r\nimport sys\r\nsys.path.append(os.path.split(sys.path[0])[0])\r\n\r\nfrom .unet import UNet3DConditionModel\r\nfrom torch.optim.lr_scheduler import LambdaLR\r\n\r\ndef customized_lr_scheduler(optimizer, warmup_steps=5000): # 5000 from u-vit\r\n    from torch.optim.lr_scheduler import LambdaLR\r\n    def fn(step):\r\n        if warmup_steps > 0:\r\n            return min(step / warmup_steps, 1)\r\n        else:\r\n            return 1\r\n    return LambdaLR(optimizer, fn)\r\n\r\n\r\ndef get_lr_scheduler(optimizer, name, **kwargs):\r\n    if name == 'warmup':\r\n        return customized_lr_scheduler(optimizer, **kwargs)\r\n    elif name == 'cosine':\r\n        from torch.optim.lr_scheduler import CosineAnnealingLR\r\n        return CosineAnnealingLR(optimizer, **kwargs)\r\n    else:\r\n        raise NotImplementedError(name)\r\n    \r\ndef get_models(args):\r\n    if 'UNet' in args.model:\r\n        pretrained_model_path = args.pretrained_model_path\r\n        return UNet3DConditionModel.from_pretrained_2d(pretrained_model_path, subfolder=\"unet\", use_concat=args.use_mask)\r\n    else:\r\n        raise '{} Model Not Supported!'.format(args.model)\r\n    "
  },
  {
    "path": "models/attention.py",
    "content": "# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py\nimport os\nimport sys\nsys.path.append(os.path.split(sys.path[0])[0])\nfrom dataclasses import dataclass\nfrom typing import Optional\n\nimport math\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\nfrom copy import deepcopy\nfrom diffusers.configuration_utils import ConfigMixin, register_to_config\nfrom diffusers.utils import BaseOutput\nfrom diffusers.utils.import_utils import is_xformers_available\nfrom diffusers.models.attention import FeedForward, AdaLayerNorm\nfrom rotary_embedding_torch import RotaryEmbedding\nfrom typing import Callable, Optional\nfrom einops import rearrange, repeat\n\ntry:\n    from diffusers.models.modeling_utils import ModelMixin\nexcept:\n    from diffusers.modeling_utils import ModelMixin # 0.11.1\n\n\n@dataclass\nclass Transformer3DModelOutput(BaseOutput):\n    sample: torch.FloatTensor\n\n\nif is_xformers_available():\n    import xformers\n    import xformers.ops\nelse:\n    xformers = None\n\ndef exists(x):\n    return x is not None\n\n\nclass CrossAttention(nn.Module):\n    r\"\"\"\n    copy from diffuser 0.11.1\n    A cross attention layer.\n    Parameters:\n        query_dim (`int`): The number of channels in the query.\n        cross_attention_dim (`int`, *optional*):\n            The number of channels in the encoder_hidden_states. 
If not given, defaults to `query_dim`.\n        heads (`int`,  *optional*, defaults to 8): The number of heads to use for multi-head attention.\n        dim_head (`int`,  *optional*, defaults to 64): The number of channels in each head.\n        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.\n        bias (`bool`, *optional*, defaults to False):\n            Set to `True` for the query, key, and value linear layers to contain a bias parameter.\n    \"\"\"\n\n    def __init__(\n        self,\n        query_dim: int,\n        cross_attention_dim: Optional[int] = None,\n        heads: int = 8,\n        dim_head: int = 64,\n        dropout: float = 0.0,\n        bias=False,\n        upcast_attention: bool = False,\n        upcast_softmax: bool = False,\n        added_kv_proj_dim: Optional[int] = None,\n        norm_num_groups: Optional[int] = None,\n        use_relative_position: bool = False,\n    ):\n        super().__init__()\n        # print('num head', heads)\n        inner_dim = dim_head * heads\n        cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim\n        self.upcast_attention = upcast_attention\n        self.upcast_softmax = upcast_softmax\n\n        self.scale = dim_head**-0.5\n\n        self.heads = heads\n        self.dim_head = dim_head\n        # for slice_size > 0 the attention score computation\n        # is split across the batch axis to save memory\n        # You can set slice_size with `set_attention_slice`\n        self.sliceable_head_dim = heads\n        self._slice_size = None\n        self._use_memory_efficient_attention_xformers = False\n        self.added_kv_proj_dim = added_kv_proj_dim\n\n        if norm_num_groups is not None:\n            self.group_norm = nn.GroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)\n        else:\n            self.group_norm = None\n\n        self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)\n     
   self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)\n        self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)\n\n        if self.added_kv_proj_dim is not None:\n            self.add_k_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)\n            self.add_v_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)\n\n        self.to_out = nn.ModuleList([])\n        self.to_out.append(nn.Linear(inner_dim, query_dim))\n        self.to_out.append(nn.Dropout(dropout))\n\n        # print(use_relative_position)\n        self.use_relative_position = use_relative_position\n        if self.use_relative_position:\n            self.rotary_emb = RotaryEmbedding(min(32, dim_head))\n\n        self.ip_transformed = False\n        self.ip_scale = 1\n    \n    def ip_transform(self):\n        if self.ip_transformed is not True:\n            self.ip_to_k = deepcopy(self.to_k).to(next(self.parameters()).device)\n            self.ip_to_v = deepcopy(self.to_v).to(next(self.parameters()).device)\n            self.ip_transformed = True\n\n    def ip_train_set(self):\n        if self.ip_transformed is True:\n            self.ip_to_k.requires_grad_(True)\n            self.ip_to_v.requires_grad_(True)\n\n    def set_scale(self, scale):\n        self.ip_scale = scale\n\n    def reshape_heads_to_batch_dim(self, tensor):\n        batch_size, seq_len, dim = tensor.shape\n        head_size = self.heads\n        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)\n        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)\n        return tensor\n\n    def reshape_batch_dim_to_heads(self, tensor):\n        batch_size, seq_len, dim = tensor.shape\n        head_size = self.heads\n        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)\n        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)\n        return tensor\n    \n    
def reshape_for_scores(self, tensor):\n        # split heads and dims\n        # tensor should be [b (h w)] f (d nd)\n        batch_size, seq_len, dim = tensor.shape\n        head_size = self.heads\n        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)\n        tensor = tensor.permute(0, 2, 1, 3).contiguous()\n        return tensor\n    \n    def same_batch_dim_to_heads(self, tensor):\n        batch_size, head_size, seq_len, dim = tensor.shape # [b (h w)] nd f d\n        tensor = tensor.reshape(batch_size, seq_len, dim * head_size)\n        return tensor\n\n    def set_attention_slice(self, slice_size):\n        if slice_size is not None and slice_size > self.sliceable_head_dim:\n            raise ValueError(f\"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.\")\n\n        self._slice_size = slice_size\n\n    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, use_image_num=None, ip_hidden_states=None):\n        batch_size, sequence_length, _ = hidden_states.shape\n\n        encoder_hidden_states = encoder_hidden_states\n\n        if self.group_norm is not None:\n            hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)\n\n        query = self.to_q(hidden_states) # [b (h w)] f (nd * d)\n\n        dim = query.shape[-1]\n        if not self.use_relative_position:\n            query = self.reshape_heads_to_batch_dim(query) # [b (h w) nd] f d\n\n        if self.added_kv_proj_dim is not None:\n            key = self.to_k(hidden_states)\n            value = self.to_v(hidden_states)\n            encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)\n            encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)\n\n            key = self.reshape_heads_to_batch_dim(key)\n            value = self.reshape_heads_to_batch_dim(value)\n            encoder_hidden_states_key_proj = 
self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)\n            encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)\n\n            key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)\n            value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)\n        else:\n            encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states\n            key = self.to_k(encoder_hidden_states)\n            value = self.to_v(encoder_hidden_states)\n            \n            if not self.use_relative_position:\n                key = self.reshape_heads_to_batch_dim(key)\n                value = self.reshape_heads_to_batch_dim(value)\n\n        if self.ip_transformed is True and ip_hidden_states is not None:\n            # print(ip_hidden_states.dtype)\n            # print(self.ip_to_k.weight.dtype)\n            ip_key = self.ip_to_k(ip_hidden_states)\n            ip_value = self.ip_to_v(ip_hidden_states)\n\n            if not self.use_relative_position:\n                ip_key = self.reshape_heads_to_batch_dim(ip_key)\n                ip_value = self.reshape_heads_to_batch_dim(ip_value)\n\n        if attention_mask is not None:\n            if attention_mask.shape[-1] != query.shape[1]:\n                target_length = query.shape[1]\n                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)\n                attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)\n\n        # attention, what we cannot get enough of\n        if self._use_memory_efficient_attention_xformers:\n            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)\n            # Some versions of xformers return output in fp32, cast it back to the dtype of the input\n            hidden_states = hidden_states.to(query.dtype)\n\n            if self.ip_transformed is True and ip_hidden_states is 
not None:\n                ip_hidden_states = self._memory_efficient_attention_xformers(query, ip_key, ip_value, attention_mask)\n                ip_hidden_states = ip_hidden_states.to(query.dtype)\n\n        else:\n            if self._slice_size is None or query.shape[0] // self._slice_size == 1:\n                hidden_states = self._attention(query, key, value, attention_mask)\n\n                if self.ip_transformed is True and ip_hidden_states is not None:\n                    ip_hidden_states = self._attention(query, ip_key, ip_value, attention_mask)\n            else:\n                hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)\n\n                if self.ip_transformed is True and ip_hidden_states is not None:\n                    ip_hidden_states = self._sliced_attention(query, ip_key, ip_value, sequence_length, dim, attention_mask)\n\n        if self.ip_transformed is True and ip_hidden_states is not None:\n            hidden_states = hidden_states + self.ip_scale * ip_hidden_states\n\n        # linear proj\n        hidden_states = self.to_out[0](hidden_states)\n\n        # dropout\n        hidden_states = self.to_out[1](hidden_states)\n        return hidden_states\n\n\n    def _attention(self, query, key, value, attention_mask=None):\n        if self.upcast_attention:\n            query = query.float()\n            key = key.float()\n\n        attention_scores = torch.baddbmm(\n            torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),\n            query,\n            key.transpose(-1, -2),\n            beta=0,\n            alpha=self.scale,\n        )\n\n        if attention_mask is not None:\n            attention_scores = attention_scores + attention_mask\n\n        if self.upcast_softmax:\n            attention_scores = attention_scores.float()\n\n        attention_probs = attention_scores.softmax(dim=-1)\n        attention_probs = 
attention_probs.to(value.dtype)\n        hidden_states = torch.bmm(attention_probs, value)\n        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)\n        return hidden_states\n\n    def _sliced_attention(self, query, key, value, sequence_length, dim, attention_mask):\n        batch_size_attention = query.shape[0]\n        hidden_states = torch.zeros(\n            (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype\n        )\n        slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]\n        for i in range(hidden_states.shape[0] // slice_size):\n            start_idx = i * slice_size\n            end_idx = (i + 1) * slice_size\n\n            query_slice = query[start_idx:end_idx]\n            key_slice = key[start_idx:end_idx]\n\n            if self.upcast_attention:\n                query_slice = query_slice.float()\n                key_slice = key_slice.float()\n\n            attn_slice = torch.baddbmm(\n                torch.empty(slice_size, query.shape[1], key.shape[1], dtype=query_slice.dtype, device=query.device),\n                query_slice,\n                key_slice.transpose(-1, -2),\n                beta=0,\n                alpha=self.scale,\n            )\n\n            if attention_mask is not None:\n                attn_slice = attn_slice + attention_mask[start_idx:end_idx]\n\n            if self.upcast_softmax:\n                attn_slice = attn_slice.float()\n\n            attn_slice = attn_slice.softmax(dim=-1)\n\n            # cast back to the original dtype\n            attn_slice = attn_slice.to(value.dtype)\n            attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])\n\n            hidden_states[start_idx:end_idx] = attn_slice\n\n        # reshape hidden_states\n        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)\n        return hidden_states\n\n    def _memory_efficient_attention_xformers(self, query, key, 
value, attention_mask):\n        # TODO attention_mask\n        query = query.contiguous()\n        key = key.contiguous()\n        value = value.contiguous()\n        hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)\n        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)\n        return hidden_states\n\n\nclass Transformer3DModel(ModelMixin, ConfigMixin):\n    @register_to_config\n    def __init__(\n        self,\n        num_attention_heads: int = 16,\n        attention_head_dim: int = 88,\n        in_channels: Optional[int] = None,\n        num_layers: int = 1,\n        dropout: float = 0.0,\n        norm_num_groups: int = 32,\n        cross_attention_dim: Optional[int] = None,\n        attention_bias: bool = False,\n        activation_fn: str = \"geglu\",\n        num_embeds_ada_norm: Optional[int] = None,\n        use_linear_projection: bool = False,\n        only_cross_attention: bool = False,\n        upcast_attention: bool = False,\n        use_first_frame: bool = False,\n        use_relative_position: bool = False,\n        rotary_emb: bool = None,\n    ):\n        super().__init__()\n        self.use_linear_projection = use_linear_projection\n        self.num_attention_heads = num_attention_heads\n        self.attention_head_dim = attention_head_dim\n        inner_dim = num_attention_heads * attention_head_dim\n\n        # Define input layers\n        self.in_channels = in_channels\n\n        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)\n        if use_linear_projection:\n            self.proj_in = nn.Linear(in_channels, inner_dim)\n        else:\n            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)\n\n        # Define transformers blocks\n        self.transformer_blocks = nn.ModuleList(\n            [\n                BasicTransformerBlock(\n                    inner_dim,\n          
          num_attention_heads,\n                    attention_head_dim,\n                    dropout=dropout,\n                    cross_attention_dim=cross_attention_dim,\n                    activation_fn=activation_fn,\n                    num_embeds_ada_norm=num_embeds_ada_norm,\n                    attention_bias=attention_bias,\n                    only_cross_attention=only_cross_attention,\n                    upcast_attention=upcast_attention,\n                    use_first_frame=use_first_frame,\n                    use_relative_position=use_relative_position,\n                    rotary_emb=rotary_emb,\n                )\n                for d in range(num_layers)\n            ]\n        )\n\n        # 4. Define output layers\n        if use_linear_projection:\n            self.proj_out = nn.Linear(in_channels, inner_dim)\n        else:\n            self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)\n\n    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, use_image_num=None, return_dict: bool = True, ip_hidden_states=None, encoder_temporal_hidden_states=None):\n        # Input\n        # if ip_hidden_states is not None:\n        #     ip_hidden_states = ip_hidden_states.to(dtype=encoder_hidden_states.dtype)\n            # print(ip_hidden_states.shape)\n            # print(encoder_hidden_states.shape)\n        assert hidden_states.dim() == 5, f\"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}.\"\n        if self.training:\n            video_length = hidden_states.shape[2] - use_image_num\n            hidden_states = rearrange(hidden_states, \"b c f h w -> (b f) c h w\").contiguous()\n            encoder_hidden_states_length = encoder_hidden_states.shape[1]\n            encoder_hidden_states_video = encoder_hidden_states[:, :encoder_hidden_states_length - use_image_num, ...]\n            encoder_hidden_states_video = repeat(encoder_hidden_states_video, 'b m n c -> b (m f) 
n c', f=video_length).contiguous()\n            encoder_hidden_states_image = encoder_hidden_states[:, encoder_hidden_states_length - use_image_num:, ...]\n            encoder_hidden_states = torch.cat([encoder_hidden_states_video, encoder_hidden_states_image], dim=1)\n            encoder_hidden_states = rearrange(encoder_hidden_states, 'b m n c -> (b m) n c').contiguous()\n\n            if ip_hidden_states is not None:\n                ip_hidden_states_length = ip_hidden_states.shape[1]\n                ip_hidden_states_video = ip_hidden_states[:, :ip_hidden_states_length - use_image_num, ...]\n                ip_hidden_states_video = repeat(ip_hidden_states_video, 'b m n c -> b (m f) n c', f=video_length).contiguous()\n                ip_hidden_states_image = ip_hidden_states[:, ip_hidden_states_length - use_image_num:, ...]\n                ip_hidden_states = torch.cat([ip_hidden_states_video, ip_hidden_states_image], dim=1)\n                ip_hidden_states = rearrange(ip_hidden_states, 'b m n c -> (b m) n c').contiguous()\n\n        else:\n            video_length = hidden_states.shape[2]\n            hidden_states = rearrange(hidden_states, \"b c f h w -> (b f) c h w\").contiguous()\n            encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b f) n c', f=video_length).contiguous()\n            \n            if encoder_temporal_hidden_states is not None:\n                encoder_temporal_hidden_states = repeat(encoder_temporal_hidden_states, 'b n c -> (b f) n c', f=video_length).contiguous()\n\n            if ip_hidden_states is not None:\n                ip_hidden_states = repeat(ip_hidden_states, 'b 1 n c -> (b f) n c', f=video_length).contiguous()\n\n        batch, channel, height, weight = hidden_states.shape\n        residual = hidden_states\n\n        hidden_states = self.norm(hidden_states)\n        if not self.use_linear_projection:\n            hidden_states = self.proj_in(hidden_states)\n            inner_dim = 
hidden_states.shape[1]\n            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)\n        else:\n            inner_dim = hidden_states.shape[1]\n            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)\n            hidden_states = self.proj_in(hidden_states)\n\n        # Blocks\n        for block in self.transformer_blocks:\n            hidden_states = block(\n                hidden_states,\n                encoder_hidden_states=encoder_hidden_states,\n                timestep=timestep,\n                video_length=video_length,\n                use_image_num=use_image_num,\n                ip_hidden_states=ip_hidden_states,\n                encoder_temporal_hidden_states=encoder_temporal_hidden_states\n            )\n\n        # Output\n        if not self.use_linear_projection:\n            hidden_states = (\n                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()\n            )\n            hidden_states = self.proj_out(hidden_states)\n        else:\n            hidden_states = self.proj_out(hidden_states)\n            hidden_states = (\n                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()\n            )\n\n        output = hidden_states + residual\n\n        output = rearrange(output, \"(b f) c h w -> b c f h w\", f=video_length + use_image_num).contiguous()\n        if not return_dict:\n            return (output,)\n\n        return Transformer3DModelOutput(sample=output)\n\n\nclass BasicTransformerBlock(nn.Module):\n    def __init__(\n        self,\n        dim: int,\n        num_attention_heads: int,\n        attention_head_dim: int,\n        dropout=0.0,\n        cross_attention_dim: Optional[int] = None,\n        activation_fn: str = \"geglu\",\n        num_embeds_ada_norm: Optional[int] = None,\n        attention_bias: bool = False,\n        
only_cross_attention: bool = False,\n        upcast_attention: bool = False,\n        use_first_frame: bool = False,\n        use_relative_position: bool = False,\n        rotary_emb: bool = False,\n    ):\n        super().__init__()\n        self.only_cross_attention = only_cross_attention\n        # print(only_cross_attention)\n        self.use_ada_layer_norm = num_embeds_ada_norm is not None\n        # print(self.use_ada_layer_norm)\n        self.use_first_frame = use_first_frame\n\n        self.dim = dim\n        self.cross_attention_dim = cross_attention_dim\n        self.num_attention_heads = num_attention_heads\n        self.attention_head_dim = attention_head_dim\n        self.dropout = dropout\n        self.attention_bias = attention_bias\n        self.upcast_attention = upcast_attention\n\n        # Spatial-Attn\n        self.attn1 = CrossAttention(\n            query_dim=dim,\n            heads=num_attention_heads,\n            dim_head=attention_head_dim,\n            dropout=dropout,\n            bias=attention_bias,\n            cross_attention_dim=None,\n            upcast_attention=upcast_attention,\n        )\n        self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)\n\n        # Text Cross-Attn\n        if cross_attention_dim is not None:\n            self.attn2 = CrossAttention(\n                query_dim=dim,\n                cross_attention_dim=cross_attention_dim,\n                heads=num_attention_heads,\n                dim_head=attention_head_dim,\n                dropout=dropout,\n                bias=attention_bias,\n                upcast_attention=upcast_attention,\n            )\n        else:\n            self.attn2 = None\n\n        if cross_attention_dim is not None:\n            self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)\n        else:\n            self.norm2 = None\n\n        # Temp\n        self.attn_temp = 
TemporalAttention(\n                query_dim=dim,\n                heads=num_attention_heads,\n                dim_head=attention_head_dim,\n                dropout=dropout,\n                bias=attention_bias,\n                cross_attention_dim=None,\n                upcast_attention=upcast_attention,\n                rotary_emb=rotary_emb,\n            )\n        self.norm_temp = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)\n        nn.init.zeros_(self.attn_temp.to_out[0].weight.data)\n\n        # Feed-forward\n        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)\n        self.norm3 = nn.LayerNorm(dim)\n\n        self.tca_transformed = False\n\n    def tca_transform(self):\n        if self.tca_transformed is not True:\n            self.cross_attn_temp = CrossAttention(\n                    query_dim=self.dim * 16,\n                    cross_attention_dim=self.cross_attention_dim,\n                    heads=self.num_attention_heads,\n                    dim_head=self.attention_head_dim,\n                    dropout=self.dropout,\n                    bias=self.attention_bias,\n                    upcast_attention=self.upcast_attention,\n            )\n            self.cross_norm_temp = AdaLayerNorm(self.dim * 16, self.num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(self.dim * 16)\n            nn.init.zeros_(self.cross_attn_temp.to_out[0].weight.data)\n            self.tca_transformed = True\n\n    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool, op=None):\n\n        if not is_xformers_available():\n            print(\"Here is how to install it\")\n            raise ModuleNotFoundError(\n                \"Refer to https://github.com/facebookresearch/xformers for more information on how to install\"\n                \" xformers\",\n                name=\"xformers\",\n            )\n        elif not 
torch.cuda.is_available():\n            raise ValueError(\n                \"torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only\"\n                \" available for GPU \"\n            )\n        else:\n            try:\n                # Make sure we can run the memory efficient attention\n                _ = xformers.ops.memory_efficient_attention(\n                    torch.randn((1, 2, 40), device=\"cuda\"),\n                    torch.randn((1, 2, 40), device=\"cuda\"),\n                    torch.randn((1, 2, 40), device=\"cuda\"),\n                )\n            except Exception as e:\n                raise e\n            self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers\n            if self.attn2 is not None:\n                self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers\n\n    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None, video_length=None, use_image_num=None, ip_hidden_states=None, encoder_temporal_hidden_states=None):\n        # SparseCausal-Attention\n        norm_hidden_states = (\n            self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)\n        )\n\n        if self.only_cross_attention:\n            hidden_states = (\n                self.attn1(norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask) + hidden_states\n            )\n        else:\n            hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, use_image_num=use_image_num) + hidden_states\n\n        if self.attn2 is not None:\n            # Cross-Attention\n            norm_hidden_states = (\n                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)\n            )\n            hidden_states = (\n                self.attn2(\n                    
norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, ip_hidden_states=ip_hidden_states\n                )\n                + hidden_states\n            )\n\n        # Temporal Attention\n        if self.training:\n            d = hidden_states.shape[1]\n            hidden_states = rearrange(hidden_states, \"(b f) d c -> (b d) f c\", f=video_length + use_image_num).contiguous()\n            hidden_states_video = hidden_states[:, :video_length, :]\n            hidden_states_image = hidden_states[:, video_length:, :]\n            norm_hidden_states_video = (\n                self.norm_temp(hidden_states_video, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states_video)\n            )\n            hidden_states_video = self.attn_temp(norm_hidden_states_video) + hidden_states_video\n            \n            # Temporal Cross Attention\n            if self.tca_transformed is True:\n                hidden_states_video = rearrange(hidden_states_video, \"(b d) f c -> b d (f c)\", d=d).contiguous()\n                norm_hidden_states_video = (\n                    self.cross_norm_temp(hidden_states_video, timestep) if self.use_ada_layer_norm else self.cross_norm_temp(hidden_states_video)\n                )\n                temp_encoder_hidden_states = rearrange(encoder_hidden_states, \"(b f) d c -> b f d c\", f=video_length + use_image_num).contiguous()\n                temp_encoder_hidden_states = temp_encoder_hidden_states[:, 0:1].squeeze(dim=1)\n                hidden_states_video = self.cross_attn_temp(norm_hidden_states_video, encoder_hidden_states=temp_encoder_hidden_states, attention_mask=attention_mask) + hidden_states_video\n                hidden_states_video = rearrange(hidden_states_video, \"b d (f c) -> (b d) f c\", f=video_length).contiguous()\n\n            hidden_states = torch.cat([hidden_states_video, hidden_states_image], dim=1)\n            hidden_states = rearrange(hidden_states, \"(b d) f c -> 
(b f) d c\", d=d).contiguous()\n        else:\n            d = hidden_states.shape[1]\n            hidden_states = rearrange(hidden_states, \"(b f) d c -> (b d) f c\", f=video_length + use_image_num).contiguous()\n            norm_hidden_states = (\n                self.norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states)\n            )\n            hidden_states = self.attn_temp(norm_hidden_states) + hidden_states\n\n            # Temporal Cross Attention\n            if self.tca_transformed is True:\n                hidden_states = rearrange(hidden_states, \"(b d) f c -> b d (f c)\", d=d).contiguous()\n                norm_hidden_states = (\n                    self.cross_norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.cross_norm_temp(hidden_states)\n                )\n                if encoder_temporal_hidden_states is not None:\n                    encoder_hidden_states = encoder_temporal_hidden_states\n                temp_encoder_hidden_states = rearrange(encoder_hidden_states, \"(b f) d c -> b f d c\", f=video_length + use_image_num).contiguous()\n                temp_encoder_hidden_states = temp_encoder_hidden_states[:, 0:1].squeeze(dim=1)\n                hidden_states = self.cross_attn_temp(norm_hidden_states, encoder_hidden_states=temp_encoder_hidden_states, attention_mask=attention_mask) + hidden_states\n                hidden_states = rearrange(hidden_states, \"b d (f c) -> (b f) d c\", f=video_length + use_image_num, d=d).contiguous()\n            else:\n                hidden_states = rearrange(hidden_states, \"(b d) f c -> (b f) d c\", d=d).contiguous()\n\n        # Feed-forward\n        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states\n        \n        return hidden_states\n\n\nclass SparseCausalAttention(CrossAttention):\n    def forward_video(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):\n        batch_size, 
sequence_length, _ = hidden_states.shape\n\n        encoder_hidden_states = encoder_hidden_states\n\n        if self.group_norm is not None:\n            hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)\n\n        query = self.to_q(hidden_states)\n        dim = query.shape[-1]\n        query = self.reshape_heads_to_batch_dim(query)\n\n        if self.added_kv_proj_dim is not None:\n            raise NotImplementedError\n\n        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states\n        key = self.to_k(encoder_hidden_states)\n        value = self.to_v(encoder_hidden_states)\n\n        former_frame_index = torch.arange(video_length) - 1\n        former_frame_index[0] = 0\n\n        key = rearrange(key, \"(b f) d c -> b f d c\", f=video_length).contiguous()\n        key = torch.cat([key[:, [0] * video_length], key[:, former_frame_index]], dim=2)\n        key = rearrange(key, \"b f d c -> (b f) d c\").contiguous()\n\n        value = rearrange(value, \"(b f) d c -> b f d c\", f=video_length).contiguous()\n        value = torch.cat([value[:, [0] * video_length], value[:, former_frame_index]], dim=2)\n        value = rearrange(value, \"b f d c -> (b f) d c\").contiguous()\n\n        key = self.reshape_heads_to_batch_dim(key)\n        value = self.reshape_heads_to_batch_dim(value)\n\n        if attention_mask is not None:\n            if attention_mask.shape[-1] != query.shape[1]:\n                target_length = query.shape[1]\n                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)\n                attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)\n\n        # attention, what we cannot get enough of\n        if self._use_memory_efficient_attention_xformers:\n            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)\n            # Some versions of xformers return output in fp32, cast it back to 
the dtype of the input\n            hidden_states = hidden_states.to(query.dtype)\n        else:\n            if self._slice_size is None or query.shape[0] // self._slice_size == 1:\n                hidden_states = self._attention(query, key, value, attention_mask)\n            else:\n                hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)\n\n        # linear proj\n        hidden_states = self.to_out[0](hidden_states)\n\n        # dropout\n        hidden_states = self.to_out[1](hidden_states)\n        return hidden_states\n    \n    def forward_image(self, hidden_states, encoder_hidden_states=None, attention_mask=None, use_image_num=None):\n        batch_size, sequence_length, _ = hidden_states.shape\n\n        encoder_hidden_states = encoder_hidden_states\n\n        if self.group_norm is not None:\n            hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)\n\n        query = self.to_q(hidden_states) # [b (h w)] f (nd * d)\n        dim = query.shape[-1]\n        if not self.use_relative_position:\n            query = self.reshape_heads_to_batch_dim(query) # [b (h w) nd] f d\n\n        if self.added_kv_proj_dim is not None:\n            key = self.to_k(hidden_states)\n            value = self.to_v(hidden_states)\n            encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)\n            encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)\n\n            key = self.reshape_heads_to_batch_dim(key)\n            value = self.reshape_heads_to_batch_dim(value)\n            encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)\n            encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)\n\n            key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)\n            value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)\n  
      else:\n            encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states\n            key = self.to_k(encoder_hidden_states)\n            value = self.to_v(encoder_hidden_states)\n            \n            if not self.use_relative_position:\n                key = self.reshape_heads_to_batch_dim(key)\n                value = self.reshape_heads_to_batch_dim(value)\n\n        if attention_mask is not None:\n            if attention_mask.shape[-1] != query.shape[1]:\n                target_length = query.shape[1]\n                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)\n                attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)\n\n        # attention, what we cannot get enough of\n        if self._use_memory_efficient_attention_xformers:\n            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)\n            # Some versions of xformers return output in fp32, cast it back to the dtype of the input\n            hidden_states = hidden_states.to(query.dtype)\n        else:\n            if self._slice_size is None or query.shape[0] // self._slice_size == 1:\n                hidden_states = self._attention(query, key, value, attention_mask)\n            else:\n                hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)\n\n        # linear proj\n        hidden_states = self.to_out[0](hidden_states)\n\n        # dropout\n        hidden_states = self.to_out[1](hidden_states)\n        return hidden_states\n    \n    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None, use_image_num=None):\n        if self.training:\n            # print(use_image_num)\n            hidden_states = rearrange(hidden_states, \"(b f) d c -> b f d c\", f=video_length + use_image_num).contiguous()\n            hidden_states_video = hidden_states[:, 
:video_length, ...]\n            hidden_states_image = hidden_states[:, video_length:, ...]\n            hidden_states_video = rearrange(hidden_states_video, 'b f d c -> (b f) d c').contiguous()\n            hidden_states_image = rearrange(hidden_states_image, 'b f d c -> (b f) d c').contiguous()\n            hidden_states_video = self.forward_video(hidden_states=hidden_states_video, \n                            encoder_hidden_states=encoder_hidden_states, \n                            attention_mask=attention_mask, \n                            video_length=video_length)\n            hidden_states_image = self.forward_image(hidden_states=hidden_states_image, \n                                                    encoder_hidden_states=encoder_hidden_states, \n                                                    attention_mask=attention_mask)\n            hidden_states = torch.cat([hidden_states_video, hidden_states_image], dim=0)\n            return hidden_states\n            # exit()\n        else:\n            return self.forward_video(hidden_states=hidden_states, \n                            encoder_hidden_states=encoder_hidden_states, \n                            attention_mask=attention_mask, \n                            video_length=video_length)\n\nclass TemporalAttention(CrossAttention):\n    def __init__(self, \n                query_dim: int,\n                cross_attention_dim: Optional[int] = None,\n                heads: int = 8,\n                dim_head: int = 64,\n                dropout: float = 0.0,\n                bias=False,\n                upcast_attention: bool = False,\n                upcast_softmax: bool = False,\n                added_kv_proj_dim: Optional[int] = None,\n                norm_num_groups: Optional[int] = None,\n                rotary_emb=None):\n        super().__init__(query_dim, cross_attention_dim, heads, dim_head, dropout, bias, upcast_attention, upcast_softmax, added_kv_proj_dim, norm_num_groups)\n        # relative 
time positional embeddings\n        self.time_rel_pos_bias = RelativePositionBias(heads=heads, max_distance=32) # realistically will not be able to generate that many frames of video... yet\n        self.rotary_emb = rotary_emb\n\n    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):\n        time_rel_pos_bias = self.time_rel_pos_bias(hidden_states.shape[1], device=hidden_states.device)\n        batch_size, sequence_length, _ = hidden_states.shape\n\n        encoder_hidden_states = encoder_hidden_states\n\n        if self.group_norm is not None:\n            hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)\n\n        query = self.to_q(hidden_states) # [b (h w)] f (nd * d)\n        dim = query.shape[-1]\n        \n        if self.added_kv_proj_dim is not None:\n            key = self.to_k(hidden_states)\n            value = self.to_v(hidden_states)\n            encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)\n            encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)\n\n            key = self.reshape_heads_to_batch_dim(key)\n            value = self.reshape_heads_to_batch_dim(value)\n            encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)\n            encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)\n\n            key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)\n            value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)\n        else:\n            encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states\n            key = self.to_k(encoder_hidden_states)\n            value = self.to_v(encoder_hidden_states)\n            \n        if attention_mask is not None:\n            if attention_mask.shape[-1] != query.shape[1]:\n                target_length = 
query.shape[1]\n                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)\n                attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)\n\n        # attention, what we cannot get enough of\n        if self._use_memory_efficient_attention_xformers:\n            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)\n            # Some versions of xformers return output in fp32, cast it back to the dtype of the input\n            hidden_states = hidden_states.to(query.dtype)\n        else:\n            if self._slice_size is None or query.shape[0] // self._slice_size == 1:\n                hidden_states = self._attention(query, key, value, attention_mask, time_rel_pos_bias)\n            else:\n                hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)\n\n        # linear proj\n        hidden_states = self.to_out[0](hidden_states)\n\n        # dropout\n        hidden_states = self.to_out[1](hidden_states)\n        return hidden_states\n\n\n    def _attention(self, query, key, value, attention_mask=None, time_rel_pos_bias=None):\n        if self.upcast_attention:\n            query = query.float()\n            key = key.float()\n\n        query = self.scale * rearrange(query, 'b f (h d) -> b h f d', h=self.heads) # d: dim_head; n: heads\n        key = rearrange(key, 'b f (h d) -> b h f d', h=self.heads) # d: dim_head; n: heads\n        value = rearrange(value, 'b f (h d) -> b h f d', h=self.heads) # d: dim_head; n: heads\n\n        # torch.baddbmm only accepte 3-D tensor\n        # https://runebook.dev/zh/docs/pytorch/generated/torch.baddbmm\n        # attention_scores = self.scale * torch.matmul(query, key.transpose(-1, -2))\n        if exists(self.rotary_emb):\n            query = self.rotary_emb.rotate_queries_or_keys(query)\n            key = self.rotary_emb.rotate_queries_or_keys(key)\n\n        attention_scores = 
torch.einsum('... h i d, ... h j d -> ... h i j', query, key)\n\n        attention_scores = attention_scores + time_rel_pos_bias\n\n        if attention_mask is not None:\n            # add attention mask\n            attention_scores = attention_scores + attention_mask\n\n        # vdm \n        attention_scores = attention_scores - attention_scores.amax(dim = -1, keepdim = True).detach()\n\n        attention_probs = nn.functional.softmax(attention_scores, dim=-1)\n        # print(attention_probs[0][0])\n\n        # cast back to the original dtype\n        attention_probs = attention_probs.to(value.dtype)\n\n        # compute attention output \n        hidden_states = torch.einsum('... h i j, ... h j d -> ... h i d', attention_probs, value)\n        hidden_states = rearrange(hidden_states, 'b h f d -> b f (h d)')\n        return hidden_states\n    \nclass RelativePositionBias(nn.Module):\n    def __init__(\n        self,\n        heads=8,\n        num_buckets=32,\n        max_distance=128,\n    ):\n        super().__init__()\n        self.num_buckets = num_buckets\n        self.max_distance = max_distance\n        self.relative_attention_bias = nn.Embedding(num_buckets, heads)\n\n    @staticmethod\n    def _relative_position_bucket(relative_position, num_buckets=32, max_distance=128):\n        ret = 0\n        n = -relative_position\n\n        num_buckets //= 2\n        ret += (n < 0).long() * num_buckets\n        n = torch.abs(n)\n\n        max_exact = num_buckets // 2\n        is_small = n < max_exact\n\n        val_if_large = max_exact + (\n            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)\n        ).long()\n        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))\n\n        ret += torch.where(is_small, n, val_if_large)\n        return ret\n\n    def forward(self, n, device):\n        q_pos = torch.arange(n, dtype = torch.long, device = device)\n        k_pos = 
torch.arange(n, dtype = torch.long, device = device)\n        rel_pos = rearrange(k_pos, 'j -> 1 j') - rearrange(q_pos, 'i -> i 1')\n        rp_bucket = self._relative_position_bucket(rel_pos, num_buckets = self.num_buckets, max_distance = self.max_distance)\n        values = self.relative_attention_bias(rp_bucket)\n        return rearrange(values, 'i j h -> h i j') # num_heads, num_frames, num_frames\n"
  },
  {
    "path": "models/clip.py",
    "content": "import numpy\r\nimport torch.nn as nn\r\nfrom transformers import CLIPTokenizer, CLIPTextModel\r\n\r\nimport transformers\r\ntransformers.logging.set_verbosity_error()\r\n\r\n\"\"\"\r\nWill encounter following warning:\r\n- This IS expected if you are initializing CLIPTextModel from the checkpoint of a model trained on another task\r\nor with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\r\n- This IS NOT expected if you are initializing CLIPTextModel from the checkpoint of a model \r\nthat you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\r\n\r\nhttps://github.com/CompVis/stable-diffusion/issues/97 \r\naccording to this issue, this warning is safe.\r\n\r\nThis is expected since the vision backbone of the CLIP model is not needed to run Stable Diffusion. \r\nYou can safely ignore the warning, it is not an error.\r\n\r\nThis clip usage is from U-ViT and same with Stable Diffusion.\r\n\"\"\"\r\n\r\nclass AbstractEncoder(nn.Module):\r\n    def __init__(self):\r\n        super().__init__()\r\n\r\n    def encode(self, *args, **kwargs):\r\n        raise NotImplementedError\r\n\r\n\r\nclass FrozenCLIPEmbedder(AbstractEncoder):\r\n    \"\"\"Uses the CLIP transformer encoder for text (from Hugging Face)\"\"\"\r\n    # def __init__(self, version=\"openai/clip-vit-huge-patch14\", device=\"cuda\", max_length=77):\r\n    def __init__(self, path, device=\"cuda\", max_length=77):\r\n        super().__init__()\r\n        self.tokenizer = CLIPTokenizer.from_pretrained(path, subfolder=\"tokenizer\")\r\n        self.transformer = CLIPTextModel.from_pretrained(path, subfolder='text_encoder')\r\n        self.device = device\r\n        self.max_length = max_length\r\n        self.freeze()\r\n\r\n    def freeze(self):\r\n        self.transformer = self.transformer.eval()\r\n        for param in self.parameters():\r\n            
param.requires_grad = False\r\n\r\n    def forward(self, text):\r\n        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,\r\n                                        return_overflowing_tokens=False, padding=\"max_length\", return_tensors=\"pt\")\r\n        tokens = batch_encoding[\"input_ids\"].to(self.device)\r\n        outputs = self.transformer(input_ids=tokens)\r\n\r\n        z = outputs.last_hidden_state\r\n        return z\r\n\r\n    def encode(self, text):\r\n        return self(text)\r\n    \r\n\r\nclass TextEmbedder(nn.Module):\r\n    \"\"\"\r\n    Embeds text prompt into vector representations. Also handles text dropout for classifier-free guidance.\r\n    \"\"\"\r\n    def __init__(self, path, dropout_prob=0.1):\r\n        super().__init__()\r\n        self.text_encodder = FrozenCLIPEmbedder(path=path)\r\n        self.dropout_prob = dropout_prob\r\n    \r\n    def token_drop(self, text_prompts, force_drop_ids=None):\r\n        \"\"\"\r\n        Drops text to enable classifier-free guidance.\r\n        \"\"\"\r\n        if force_drop_ids is None:\r\n            drop_ids = numpy.random.uniform(0, 1, len(text_prompts)) < self.dropout_prob\r\n        else:\r\n            # TODO\r\n            drop_ids = force_drop_ids == 1\r\n        labels = list(numpy.where(drop_ids, \"\", text_prompts))\r\n        # print(labels)\r\n        return labels\r\n\r\n    def forward(self, text_prompts, train, force_drop_ids=None):\r\n        use_dropout = self.dropout_prob > 0\r\n        if (train and use_dropout) or (force_drop_ids is not None):\r\n            text_prompts = self.token_drop(text_prompts, force_drop_ids)\r\n        embeddings = self.text_encodder(text_prompts)\r\n        return embeddings\r\n    \r\n\r\nif __name__ == '__main__':\r\n\r\n    r\"\"\"\r\n    Returns:\r\n\r\n    Examples from CLIPTextModel:\r\n\r\n    ```python\r\n    >>> from transformers import AutoTokenizer, CLIPTextModel\r\n\r\n    >>> 
model = CLIPTextModel.from_pretrained(\"openai/clip-vit-base-patch32\")\r\n    >>> tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-base-patch32\")\r\n\r\n    >>> inputs = tokenizer([\"a photo of a cat\", \"a photo of a dog\"], padding=True, return_tensors=\"pt\")\r\n\r\n    >>> outputs = model(**inputs)\r\n    >>> last_hidden_state = outputs.last_hidden_state\r\n    >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states\r\n    ```\"\"\"\r\n\r\n    import torch\r\n\r\n    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\r\n\r\n    text_encoder = TextEmbedder(path='/mnt/petrelfs/maxin/work/pretrained/stable-diffusion-2-1-base',\r\n                                dropout_prob=0.00001).to(device)\r\n\r\n    text_prompt = [[\"a photo of a cat\", \"a photo of a cat\"], [\"a photo of a dog\", \"a photo of a cat\"], ['a photo of a dog human', \"a photo of a cat\"]]\r\n    # text_prompt = ('None', 'None', 'None')\r\n    output = text_encoder(text_prompts=text_prompt, train=False)\r\n    # print(output)\r\n    print(output.shape)\r\n    # print(output.shape)"
  },
  {
    "path": "models/resnet.py",
    "content": "# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py\nimport os\nimport sys\nsys.path.append(os.path.split(sys.path[0])[0])\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom einops import rearrange\n\n\nclass InflatedConv3d(nn.Conv2d):\n    def forward(self, x):\n        video_length = x.shape[2]\n\n        x = rearrange(x, \"b c f h w -> (b f) c h w\")\n        x = super().forward(x)\n        x = rearrange(x, \"(b f) c h w -> b c f h w\", f=video_length)\n\n        return x\n\n\nclass Upsample3D(nn.Module):\n    def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name=\"conv\"):\n        super().__init__()\n        self.channels = channels\n        self.out_channels = out_channels or channels\n        self.use_conv = use_conv\n        self.use_conv_transpose = use_conv_transpose\n        self.name = name\n\n        conv = None\n        if use_conv_transpose:\n            raise NotImplementedError\n        elif use_conv:\n            conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)\n\n        if name == \"conv\":\n            self.conv = conv\n        else:\n            self.Conv2d_0 = conv\n\n    def forward(self, hidden_states, output_size=None):\n        assert hidden_states.shape[1] == self.channels\n\n        if self.use_conv_transpose:\n            raise NotImplementedError\n\n        # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16\n        dtype = hidden_states.dtype\n        if dtype == torch.bfloat16:\n            hidden_states = hidden_states.to(torch.float32)\n\n        # upsample_nearest_nhwc fails with large batch sizes. 
see https://github.com/huggingface/diffusers/issues/984\n        if hidden_states.shape[0] >= 64:\n            hidden_states = hidden_states.contiguous()\n\n        # if `output_size` is passed we force the interpolation output\n        # size and do not make use of `scale_factor=2`\n        if output_size is None:\n            hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode=\"nearest\")\n        else:\n            hidden_states = F.interpolate(hidden_states, size=output_size, mode=\"nearest\")\n\n        # If the input is bfloat16, we cast back to bfloat16\n        if dtype == torch.bfloat16:\n            hidden_states = hidden_states.to(dtype)\n\n        if self.use_conv:\n            if self.name == \"conv\":\n                hidden_states = self.conv(hidden_states)\n            else:\n                hidden_states = self.Conv2d_0(hidden_states)\n\n        return hidden_states\n\n\nclass Downsample3D(nn.Module):\n    def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name=\"conv\"):\n        super().__init__()\n        self.channels = channels\n        self.out_channels = out_channels or channels\n        self.use_conv = use_conv\n        self.padding = padding\n        stride = 2\n        self.name = name\n\n        if use_conv:\n            conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)\n        else:\n            raise NotImplementedError\n\n        if name == \"conv\":\n            self.Conv2d_0 = conv\n            self.conv = conv\n        elif name == \"Conv2d_0\":\n            self.conv = conv\n        else:\n            self.conv = conv\n\n    def forward(self, hidden_states):\n        assert hidden_states.shape[1] == self.channels\n        if self.use_conv and self.padding == 0:\n            raise NotImplementedError\n\n        assert hidden_states.shape[1] == self.channels\n        hidden_states = self.conv(hidden_states)\n\n        return 
hidden_states\n\n\nclass ResnetBlock3D(nn.Module):\n    def __init__(\n        self,\n        *,\n        in_channels,\n        out_channels=None,\n        conv_shortcut=False,\n        dropout=0.0,\n        temb_channels=512,\n        groups=32,\n        groups_out=None,\n        pre_norm=True,\n        eps=1e-6,\n        non_linearity=\"swish\",\n        time_embedding_norm=\"default\",\n        output_scale_factor=1.0,\n        use_in_shortcut=None,\n    ):\n        super().__init__()\n        self.pre_norm = pre_norm\n        self.pre_norm = True\n        self.in_channels = in_channels\n        out_channels = in_channels if out_channels is None else out_channels\n        self.out_channels = out_channels\n        self.use_conv_shortcut = conv_shortcut\n        self.time_embedding_norm = time_embedding_norm\n        self.output_scale_factor = output_scale_factor\n\n        if groups_out is None:\n            groups_out = groups\n\n        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)\n\n        self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)\n\n        if temb_channels is not None:\n            if self.time_embedding_norm == \"default\":\n                time_emb_proj_out_channels = out_channels\n            elif self.time_embedding_norm == \"scale_shift\":\n                time_emb_proj_out_channels = out_channels * 2\n            else:\n                raise ValueError(f\"unknown time_embedding_norm : {self.time_embedding_norm} \")\n\n            self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels)\n        else:\n            self.time_emb_proj = None\n\n        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)\n        self.dropout = torch.nn.Dropout(dropout)\n        self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)\n\n        if non_linearity 
== \"swish\":\n            self.nonlinearity = lambda x: F.silu(x)\n        elif non_linearity == \"mish\":\n            self.nonlinearity = Mish()\n        elif non_linearity == \"silu\":\n            self.nonlinearity = nn.SiLU()\n\n        self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut\n\n        self.conv_shortcut = None\n        if self.use_in_shortcut:\n            self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)\n\n    def forward(self, input_tensor, temb):\n        hidden_states = input_tensor\n\n        hidden_states = self.norm1(hidden_states)\n        hidden_states = self.nonlinearity(hidden_states)\n\n        hidden_states = self.conv1(hidden_states)\n\n        if temb is not None:\n            temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]\n\n        if temb is not None and self.time_embedding_norm == \"default\":\n            hidden_states = hidden_states + temb\n\n        hidden_states = self.norm2(hidden_states)\n\n        if temb is not None and self.time_embedding_norm == \"scale_shift\":\n            scale, shift = torch.chunk(temb, 2, dim=1)\n            hidden_states = hidden_states * (1 + scale) + shift\n\n        hidden_states = self.nonlinearity(hidden_states)\n\n        hidden_states = self.dropout(hidden_states)\n        hidden_states = self.conv2(hidden_states)\n\n        if self.conv_shortcut is not None:\n            input_tensor = self.conv_shortcut(input_tensor)\n\n        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor\n\n        return output_tensor\n\n\nclass Mish(torch.nn.Module):\n    def forward(self, hidden_states):\n        return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states))"
  },
  {
    "path": "models/unet.py",
    "content": "# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py\n\nfrom dataclasses import dataclass\nfrom typing import List, Optional, Tuple, Union\n\nimport os\nimport sys\nsys.path.append(os.path.split(sys.path[0])[0])\n\nimport math\nimport json\nimport torch\nimport einops\nimport torch.nn as nn\nimport torch.utils.checkpoint\n\nfrom diffusers.configuration_utils import ConfigMixin, register_to_config\nfrom diffusers.utils import BaseOutput, logging\nfrom diffusers.models.embeddings import TimestepEmbedding, Timesteps\nfrom einops import rearrange\n\ntry:\n    from diffusers.models.modeling_utils import ModelMixin\nexcept:\n    from diffusers.modeling_utils import ModelMixin # 0.11.1\n\ntry:\n    from .unet_blocks import (\n        CrossAttnDownBlock3D,\n        CrossAttnUpBlock3D,\n        DownBlock3D,\n        UNetMidBlock3DCrossAttn,\n        UpBlock3D,\n        get_down_block,\n        get_up_block,\n    )\n    from .resnet import InflatedConv3d\nexcept:\n    from unet_blocks import (\n        CrossAttnDownBlock3D,\n        CrossAttnUpBlock3D,\n        DownBlock3D,\n        UNetMidBlock3DCrossAttn,\n        UpBlock3D,\n        get_down_block,\n        get_up_block,\n    )\n    from resnet import InflatedConv3d\n\nfrom rotary_embedding_torch import RotaryEmbedding\n\nlogger = logging.get_logger(__name__)  # pylint: disable=invalid-name\n\nclass RelativePositionBias(nn.Module):\n    def __init__(\n        self,\n        heads=8,\n        num_buckets=32,\n        max_distance=128,\n    ):\n        super().__init__()\n        self.num_buckets = num_buckets\n        self.max_distance = max_distance\n        self.relative_attention_bias = nn.Embedding(num_buckets, heads)\n\n    @staticmethod\n    def _relative_position_bucket(relative_position, num_buckets=32, max_distance=128):\n        ret = 0\n        n = -relative_position\n\n        num_buckets //= 2\n        ret += (n < 0).long() * num_buckets\n  
      n = torch.abs(n)\n\n        max_exact = num_buckets // 2\n        is_small = n < max_exact\n\n        val_if_large = max_exact + (\n            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)\n        ).long()\n        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))\n\n        ret += torch.where(is_small, n, val_if_large)\n        return ret\n\n    def forward(self, n, device):\n        q_pos = torch.arange(n, dtype = torch.long, device = device)\n        k_pos = torch.arange(n, dtype = torch.long, device = device)\n        rel_pos = einops.rearrange(k_pos, 'j -> 1 j') - einops.rearrange(q_pos, 'i -> i 1')\n        rp_bucket = self._relative_position_bucket(rel_pos, num_buckets = self.num_buckets, max_distance = self.max_distance)\n        values = self.relative_attention_bias(rp_bucket)\n        return einops.rearrange(values, 'i j h -> h i j') # num_heads, num_frames, num_frames\n\n@dataclass\nclass UNet3DConditionOutput(BaseOutput):\n    sample: torch.FloatTensor\n\n\nclass UNet3DConditionModel(ModelMixin, ConfigMixin):\n    _supports_gradient_checkpointing = True\n\n    @register_to_config\n    def __init__(\n        self,\n        sample_size: Optional[int] = None, # 64\n        in_channels: int = 4,\n        out_channels: int = 4,\n        center_input_sample: bool = False,\n        flip_sin_to_cos: bool = True,\n        freq_shift: int = 0,\n        down_block_types: Tuple[str] = (\n            \"CrossAttnDownBlock3D\",\n            \"CrossAttnDownBlock3D\",\n            \"CrossAttnDownBlock3D\",\n            \"DownBlock3D\",\n        ),\n        mid_block_type: str = \"UNetMidBlock3DCrossAttn\",\n        up_block_types: Tuple[str] = (\n            \"UpBlock3D\",\n            \"CrossAttnUpBlock3D\",\n            \"CrossAttnUpBlock3D\",\n            \"CrossAttnUpBlock3D\"\n        ),\n        only_cross_attention: Union[bool, Tuple[bool]] = False,\n        
block_out_channels: Tuple[int] = (320, 640, 1280, 1280),\n        layers_per_block: int = 2,\n        downsample_padding: int = 1,\n        mid_block_scale_factor: float = 1,\n        act_fn: str = \"silu\",\n        norm_num_groups: int = 32,\n        norm_eps: float = 1e-5,\n        cross_attention_dim: int = 1280,\n        attention_head_dim: Union[int, Tuple[int]] = 8,\n        dual_cross_attention: bool = False,\n        use_linear_projection: bool = False,\n        class_embed_type: Optional[str] = None,\n        num_class_embeds: Optional[int] = None,\n        upcast_attention: bool = False,\n        resnet_time_scale_shift: str = \"default\",\n        use_first_frame: bool = False,\n        use_relative_position: bool = False,\n    ):\n        super().__init__()\n\n        # print(use_first_frame)\n\n        self.sample_size = sample_size\n        time_embed_dim = block_out_channels[0] * 4\n\n        # input\n        self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))\n\n        # time\n        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)\n        timestep_input_dim = block_out_channels[0]\n\n        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)\n\n        # class embedding\n        if class_embed_type is None and num_class_embeds is not None:\n            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)\n        elif class_embed_type == \"timestep\":\n            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)\n        elif class_embed_type == \"identity\":\n            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)\n        else:\n            self.class_embedding = None\n\n        self.down_blocks = nn.ModuleList([])\n        self.mid_block = None\n        self.up_blocks = nn.ModuleList([])\n\n        if isinstance(only_cross_attention, bool):\n            only_cross_attention = 
[only_cross_attention] * len(down_block_types)\n\n        if isinstance(attention_head_dim, int):\n            attention_head_dim = (attention_head_dim,) * len(down_block_types)\n\n        rotary_emb = RotaryEmbedding(32)\n\n        # down\n        output_channel = block_out_channels[0]\n        for i, down_block_type in enumerate(down_block_types):\n            input_channel = output_channel\n            output_channel = block_out_channels[i]\n            is_final_block = i == len(block_out_channels) - 1\n\n            down_block = get_down_block(\n                down_block_type,\n                num_layers=layers_per_block,\n                in_channels=input_channel,\n                out_channels=output_channel,\n                temb_channels=time_embed_dim,\n                add_downsample=not is_final_block,\n                resnet_eps=norm_eps,\n                resnet_act_fn=act_fn,\n                resnet_groups=norm_num_groups,\n                cross_attention_dim=cross_attention_dim,\n                attn_num_head_channels=attention_head_dim[i],\n                downsample_padding=downsample_padding,\n                dual_cross_attention=dual_cross_attention,\n                use_linear_projection=use_linear_projection,\n                only_cross_attention=only_cross_attention[i],\n                upcast_attention=upcast_attention,\n                resnet_time_scale_shift=resnet_time_scale_shift,\n                use_first_frame=use_first_frame,\n                use_relative_position=use_relative_position,\n                rotary_emb=rotary_emb,\n            )\n            self.down_blocks.append(down_block)\n\n        # mid\n        if mid_block_type == \"UNetMidBlock3DCrossAttn\":\n            self.mid_block = UNetMidBlock3DCrossAttn(\n                in_channels=block_out_channels[-1],\n                temb_channels=time_embed_dim,\n                resnet_eps=norm_eps,\n                resnet_act_fn=act_fn,\n                
output_scale_factor=mid_block_scale_factor,\n                resnet_time_scale_shift=resnet_time_scale_shift,\n                cross_attention_dim=cross_attention_dim,\n                attn_num_head_channels=attention_head_dim[-1],\n                resnet_groups=norm_num_groups,\n                dual_cross_attention=dual_cross_attention,\n                use_linear_projection=use_linear_projection,\n                upcast_attention=upcast_attention,\n                use_first_frame=use_first_frame,\n                use_relative_position=use_relative_position,\n                rotary_emb=rotary_emb,\n            )\n        else:\n            raise ValueError(f\"unknown mid_block_type : {mid_block_type}\")\n\n        # count how many layers upsample the videos\n        self.num_upsamplers = 0\n\n        # up\n        reversed_block_out_channels = list(reversed(block_out_channels))\n        reversed_attention_head_dim = list(reversed(attention_head_dim))\n        only_cross_attention = list(reversed(only_cross_attention))\n        output_channel = reversed_block_out_channels[0]\n        for i, up_block_type in enumerate(up_block_types):\n            is_final_block = i == len(block_out_channels) - 1\n\n            prev_output_channel = output_channel\n            output_channel = reversed_block_out_channels[i]\n            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]\n\n            # add upsample block for all BUT final layer\n            if not is_final_block:\n                add_upsample = True\n                self.num_upsamplers += 1\n            else:\n                add_upsample = False\n\n            up_block = get_up_block(\n                up_block_type,\n                num_layers=layers_per_block + 1,\n                in_channels=input_channel,\n                out_channels=output_channel,\n                prev_output_channel=prev_output_channel,\n                temb_channels=time_embed_dim,\n                
add_upsample=add_upsample,\n                resnet_eps=norm_eps,\n                resnet_act_fn=act_fn,\n                resnet_groups=norm_num_groups,\n                cross_attention_dim=cross_attention_dim,\n                attn_num_head_channels=reversed_attention_head_dim[i],\n                dual_cross_attention=dual_cross_attention,\n                use_linear_projection=use_linear_projection,\n                only_cross_attention=only_cross_attention[i],\n                upcast_attention=upcast_attention,\n                resnet_time_scale_shift=resnet_time_scale_shift,\n                use_first_frame=use_first_frame,\n                use_relative_position=use_relative_position,\n                rotary_emb=rotary_emb,\n            )\n            self.up_blocks.append(up_block)\n            prev_output_channel = output_channel\n\n        # out\n        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)\n        self.conv_act = nn.SiLU()\n        self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1)\n\n        # relative time positional embeddings\n        self.use_relative_position = use_relative_position\n        if self.use_relative_position:\n            self.time_rel_pos_bias = RelativePositionBias(heads=8, max_distance=32) # realistically will not be able to generate that many frames of video... yet\n\n    def set_attention_slice(self, slice_size):\n        r\"\"\"\n        Enable sliced attention computation.\n\n        When this option is enabled, the attention module will split the input tensor in slices, to compute attention\n        in several steps. This is useful to save some memory in exchange for a small speed decrease.\n\n        Args:\n            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `\"auto\"`):\n                When `\"auto\"`, halves the input to the attention heads, so attention will be computed in two steps. 
If\n                `\"max\"`, maxium amount of memory will be saved by running only one slice at a time. If a number is\n                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`\n                must be a multiple of `slice_size`.\n        \"\"\"\n        sliceable_head_dims = []\n\n        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):\n            if hasattr(module, \"set_attention_slice\"):\n                sliceable_head_dims.append(module.sliceable_head_dim)\n\n            for child in module.children():\n                fn_recursive_retrieve_slicable_dims(child)\n\n        # retrieve number of attention layers\n        for module in self.children():\n            fn_recursive_retrieve_slicable_dims(module)\n\n        num_slicable_layers = len(sliceable_head_dims)\n\n        if slice_size == \"auto\":\n            # half the attention head size is usually a good trade-off between\n            # speed and memory\n            slice_size = [dim // 2 for dim in sliceable_head_dims]\n        elif slice_size == \"max\":\n            # make smallest slice possible\n            slice_size = num_slicable_layers * [1]\n\n        slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size\n\n        if len(slice_size) != len(sliceable_head_dims):\n            raise ValueError(\n                f\"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different\"\n                f\" attention layers. 
Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}.\"\n            )\n\n        for i in range(len(slice_size)):\n            size = slice_size[i]\n            dim = sliceable_head_dims[i]\n            if size is not None and size > dim:\n                raise ValueError(f\"size {size} has to be smaller or equal to {dim}.\")\n\n        # Recursively walk through all the children.\n        # Any children which exposes the set_attention_slice method\n        # gets the message\n        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):\n            if hasattr(module, \"set_attention_slice\"):\n                module.set_attention_slice(slice_size.pop())\n\n            for child in module.children():\n                fn_recursive_set_attention_slice(child, slice_size)\n\n        reversed_slice_size = list(reversed(slice_size))\n        for module in self.children():\n            fn_recursive_set_attention_slice(module, reversed_slice_size)\n\n    def _set_gradient_checkpointing(self, module, value=False):\n        if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):\n            module.gradient_checkpointing = value\n\n    def forward(\n        self,\n        sample: torch.FloatTensor,\n        timestep: Union[torch.Tensor, float, int],\n        encoder_hidden_states: torch.Tensor = None,\n        class_labels: Optional[torch.Tensor] = None,\n        attention_mask: Optional[torch.Tensor] = None,\n        use_image_num: int = 0,\n        return_dict: bool = True,\n        ip_hidden_states = None,\n        encoder_temporal_hidden_states = None\n    ) -> Union[UNet3DConditionOutput, Tuple]:\n        r\"\"\"\n        Args:\n            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor\n            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps\n            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, 
feature_dim) encoder hidden states\n            return_dict (`bool`, *optional*, defaults to `True`):\n                Whether or not to return a [`UNet3DConditionOutput`] instead of a plain tuple.\n\n        Returns:\n            [`UNet3DConditionOutput`] or `tuple`:\n            [`UNet3DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When\n            returning a tuple, the first element is the sample tensor.\n        \"\"\"\n        # By default samples have to be at least a multiple of the overall upsampling factor.\n        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).\n        # However, the upsampling interpolation output size can be forced to fit any upsampling size\n        # on the fly if necessary.\n        if ip_hidden_states is not None:\n            b = ip_hidden_states.shape[0]\n            ip_hidden_states = rearrange(ip_hidden_states, 'b n c -> (b n) c')\n            ip_hidden_states = self.image_proj_model(ip_hidden_states)\n            ip_hidden_states = rearrange(ip_hidden_states, '(b n) m c -> b n m c', b=b)\n        default_overall_up_factor = 2**self.num_upsamplers\n\n        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`\n        forward_upsample_size = False\n        upsample_size = None\n\n        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):\n            logger.info(\"Forward upsample size to force interpolation output size.\")\n            forward_upsample_size = True\n\n        # prepare attention_mask\n        if attention_mask is not None:\n            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0\n            attention_mask = attention_mask.unsqueeze(1)\n\n        # center input if necessary\n        if self.config.center_input_sample:\n            sample = 2 * sample - 1.0\n\n        # time\n        timesteps = 
timestep\n        if not torch.is_tensor(timesteps):\n            # This would be a good case for the `match` statement (Python 3.10+)\n            is_mps = sample.device.type == \"mps\"\n            if isinstance(timestep, float):\n                dtype = torch.float32 if is_mps else torch.float64\n            else:\n                dtype = torch.int32 if is_mps else torch.int64\n            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)\n        elif len(timesteps.shape) == 0:\n            timesteps = timesteps[None].to(sample.device)\n\n        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML\n        timesteps = timesteps.expand(sample.shape[0])\n\n        t_emb = self.time_proj(timesteps)\n\n        # timesteps does not contain any weights and will always return f32 tensors\n        # but time_embedding might actually be running in fp16. so we need to cast here.\n        # there might be better ways to encapsulate this.\n        t_emb = t_emb.to(dtype=self.dtype)\n        emb = self.time_embedding(t_emb)\n\n        if self.class_embedding is not None:\n            if class_labels is None:\n                raise ValueError(\"class_labels should be provided when num_class_embeds > 0\")\n\n            if self.config.class_embed_type == \"timestep\":\n                class_labels = self.time_proj(class_labels)\n\n            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)\n            # print(emb.shape) # torch.Size([3, 1280])\n            # print(class_emb.shape) # torch.Size([3, 1280])\n            emb = emb + class_emb\n\n        if self.use_relative_position:\n            frame_rel_pos_bias = self.time_rel_pos_bias(sample.shape[2], device=sample.device)\n        else:\n            frame_rel_pos_bias = None\n\n        # pre-process\n        sample = self.conv_in(sample)\n\n        # down\n        down_block_res_samples = (sample,)\n        for downsample_block in self.down_blocks:\n        
    if hasattr(downsample_block, \"has_cross_attention\") and downsample_block.has_cross_attention:\n                sample, res_samples = downsample_block(\n                    hidden_states=sample,\n                    temb=emb,\n                    encoder_hidden_states=encoder_hidden_states,\n                    attention_mask=attention_mask,\n                    use_image_num=use_image_num,\n                    ip_hidden_states=ip_hidden_states,\n                    encoder_temporal_hidden_states=encoder_temporal_hidden_states\n                )\n            else:\n                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)\n\n            down_block_res_samples += res_samples\n\n        # mid\n        sample = self.mid_block(\n            sample, emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, use_image_num=use_image_num, ip_hidden_states=ip_hidden_states, encoder_temporal_hidden_states=encoder_temporal_hidden_states\n        )\n\n        # up\n        for i, upsample_block in enumerate(self.up_blocks):\n            is_final_block = i == len(self.up_blocks) - 1\n\n            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]\n            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]\n\n            # if we have not reached the final block and need to forward the\n            # upsample size, we do it here\n            if not is_final_block and forward_upsample_size:\n                upsample_size = down_block_res_samples[-1].shape[2:]\n\n            if hasattr(upsample_block, \"has_cross_attention\") and upsample_block.has_cross_attention:\n                sample = upsample_block(\n                    hidden_states=sample,\n                    temb=emb,\n                    res_hidden_states_tuple=res_samples,\n                    encoder_hidden_states=encoder_hidden_states,\n                    upsample_size=upsample_size,\n                    
attention_mask=attention_mask,\n                    use_image_num=use_image_num,\n                    ip_hidden_states=ip_hidden_states,\n                    encoder_temporal_hidden_states=encoder_temporal_hidden_states\n                )\n            else:\n                sample = upsample_block(\n                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size\n                )\n        # post-process\n        sample = self.conv_norm_out(sample)\n        sample = self.conv_act(sample)\n        sample = self.conv_out(sample)\n        # print(sample.shape)\n\n        if not return_dict:\n            return (sample,)\n        sample = UNet3DConditionOutput(sample=sample)\n        return sample\n    \n    def forward_with_cfg(self, \n                        x, \n                        t, \n                        encoder_hidden_states = None,\n                        class_labels: Optional[torch.Tensor] = None,\n                        cfg_scale=4.0,\n                        use_fp16=False,\n                        ip_hidden_states = None):\n        \"\"\"\n        Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.\n        \"\"\"\n        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb\n        half = x[: len(x) // 2]\n        combined = torch.cat([half, half], dim=0)\n        if use_fp16:\n            combined = combined.to(dtype=torch.float16)\n        model_out = self.forward(combined, t, encoder_hidden_states, class_labels, ip_hidden_states=ip_hidden_states).sample\n        # For exact reproducibility reasons, we apply classifier-free guidance on only\n        # three channels by default. 
The standard approach to cfg applies it to all channels.\n        # This can be done by uncommenting the following line and commenting-out the line following that.\n        eps, rest = model_out[:, :4], model_out[:, 4:]\n        # eps, rest = model_out[:, :3], model_out[:, 3:] # b c f h w\n        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)\n        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)\n        eps = torch.cat([half_eps, half_eps], dim=0)\n        return torch.cat([eps, rest], dim=1)\n\n    @classmethod\n    def from_pretrained_2d(cls, pretrained_model_path, subfolder=None, use_concat=False):\n        if subfolder is not None:\n            pretrained_model_path = os.path.join(pretrained_model_path, subfolder)\n\n            \n        # the content of the config file\n        #         {\n        # \"_class_name\": \"UNet2DConditionModel\",\n        # \"_diffusers_version\": \"0.2.2\",\n        # \"act_fn\": \"silu\",\n        # \"attention_head_dim\": 8,\n        # \"block_out_channels\": [\n        #     320,\n        #     640,\n        #     1280,\n        #     1280\n        # ],\n        # \"center_input_sample\": false,\n        # \"cross_attention_dim\": 768,\n        # \"down_block_types\": [\n        #     \"CrossAttnDownBlock2D\",\n        #     \"CrossAttnDownBlock2D\",\n        #     \"CrossAttnDownBlock2D\",\n        #     \"DownBlock2D\"\n        # ],\n        # \"downsample_padding\": 1,\n        # \"flip_sin_to_cos\": true,\n        # \"freq_shift\": 0,\n        # \"in_channels\": 4,\n        # \"layers_per_block\": 2,\n        # \"mid_block_scale_factor\": 1,\n        # \"norm_eps\": 1e-05,\n        # \"norm_num_groups\": 32,\n        # \"out_channels\": 4,\n        # \"sample_size\": 64,\n        # \"up_block_types\": [\n        #     \"UpBlock2D\",\n        #     \"CrossAttnUpBlock2D\",\n        #     \"CrossAttnUpBlock2D\",\n        #     \"CrossAttnUpBlock2D\"\n        # ]\n        # }\n        
config_file = os.path.join(pretrained_model_path, 'config.json')\n        if not os.path.isfile(config_file):\n            raise RuntimeError(f\"{config_file} does not exist\")\n        with open(config_file, \"r\") as f:\n            config = json.load(f)\n        config[\"_class_name\"] = cls.__name__\n        config[\"down_block_types\"] = [\n            \"CrossAttnDownBlock3D\",\n            \"CrossAttnDownBlock3D\",\n            \"CrossAttnDownBlock3D\",\n            \"DownBlock3D\"\n        ]\n        config[\"up_block_types\"] = [\n            \"UpBlock3D\",\n            \"CrossAttnUpBlock3D\",\n            \"CrossAttnUpBlock3D\",\n            \"CrossAttnUpBlock3D\"\n        ]\n\n        # config[\"use_first_frame\"] = True\n\n        config[\"use_first_frame\"] = False\n        if use_concat:\n            config[\"in_channels\"] = 9\n        # config[\"use_relative_position\"] = True\n\n        # # tmp\n        # config[\"class_embed_type\"] = \"timestep\"\n        # config[\"num_class_embeds\"] = 100\n\n        from diffusers.utils import WEIGHTS_NAME # diffusion_pytorch_model.bin\n        \n        # {'_class_name': 'UNet3DConditionModel', \n        #  '_diffusers_version': '0.2.2', \n        #  'act_fn': 'silu', \n        #  'attention_head_dim': 8, \n        #  'block_out_channels': [320, 640, 1280, 1280], \n        #  'center_input_sample': False, \n        #  'cross_attention_dim': 768, \n        #  'down_block_types': \n        #  ['CrossAttnDownBlock3D', \n        #   'CrossAttnDownBlock3D', \n        #   'CrossAttnDownBlock3D', \n        #   'DownBlock3D'], \n        #   'downsample_padding': 1, \n        #   'flip_sin_to_cos': True, \n        #   'freq_shift': 0, \n        #   'in_channels': 4, \n        #   'layers_per_block': 2, \n        #   'mid_block_scale_factor': 1, \n        #   'norm_eps': 1e-05, \n        #   'norm_num_groups': 32, \n        #   'out_channels': 4, \n        #   'sample_size': 64, \n        #   'up_block_types': \n        
#   ['UpBlock3D', \n        #    'CrossAttnUpBlock3D', \n        #    'CrossAttnUpBlock3D', \n        #    'CrossAttnUpBlock3D']}\n\n        model = cls.from_config(config)\n        model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)\n        if not os.path.isfile(model_file):\n            raise RuntimeError(f\"{model_file} does not exist\")\n        state_dict = torch.load(model_file, map_location=\"cpu\")\n\n        if use_concat:\n            new_state_dict = {}\n            conv_in_weight = state_dict[\"conv_in.weight\"]\n            new_conv_weight = torch.zeros((conv_in_weight.shape[0], 9, *conv_in_weight.shape[2:]), dtype=conv_in_weight.dtype)\n        \n            for i, j in zip([0, 1, 2, 3], [0, 1, 2, 3, 4, 5, 6, 7, 8]):\n                new_conv_weight[:, j] = conv_in_weight[:, i]\n            new_state_dict[\"conv_in.weight\"] = new_conv_weight\n            new_state_dict[\"conv_in.bias\"] = state_dict[\"conv_in.bias\"]\n            for k, v in model.state_dict().items():\n                # print(k)\n                if '_temp.' 
in k:\n                    new_state_dict.update({k: v})\n                if 'attn_fcross' in k: # copy params of attn1 to attn_fcross\n                    k = k.replace('attn_fcross', 'attn1')\n                    state_dict.update({k: state_dict[k]})\n                if 'norm_fcross' in k:\n                    k = k.replace('norm_fcross', 'norm1')\n                    state_dict.update({k: state_dict[k]})\n                    \n                if 'conv_in' in k:\n                    continue\n                else:\n                    new_state_dict[k] = v\n                # # tmp \n                # if 'class_embedding' in k:\n                #     state_dict.update({k: v})\n            # breakpoint()\n            model.load_state_dict(new_state_dict)\n        else:\n            for k, v in model.state_dict().items():\n                # print(k)\n                if '_temp' in k:\n                    state_dict.update({k: v})\n                if 'attn_fcross' in k: # copy params of attn1 to attn_fcross\n                    k = k.replace('attn_fcross', 'attn1')\n                    state_dict.update({k: state_dict[k]})\n                if 'norm_fcross' in k:\n                    k = k.replace('norm_fcross', 'norm1')\n                    state_dict.update({k: state_dict[k]})\n\n            model.load_state_dict(state_dict)\n\n        return model\n    "
  },
  {
    "path": "models/unet_blocks.py",
    "content": "# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py\nimport os\nimport sys\nsys.path.append(os.path.split(sys.path[0])[0])\n\nimport torch\nfrom torch import nn\n\ntry:\n    from .attention import Transformer3DModel\n    from .resnet import Downsample3D, ResnetBlock3D, Upsample3D\nexcept:\n    from attention import Transformer3DModel\n    from resnet import Downsample3D, ResnetBlock3D, Upsample3D\n\n\ndef get_down_block(\n    down_block_type,\n    num_layers,\n    in_channels,\n    out_channels,\n    temb_channels,\n    add_downsample,\n    resnet_eps,\n    resnet_act_fn,\n    attn_num_head_channels,\n    resnet_groups=None,\n    cross_attention_dim=None,\n    downsample_padding=None,\n    dual_cross_attention=False,\n    use_linear_projection=False,\n    only_cross_attention=False,\n    upcast_attention=False,\n    resnet_time_scale_shift=\"default\",\n    use_first_frame=False,\n    use_relative_position=False,\n    rotary_emb=False,\n):\n    # print(down_block_type)\n    # print(use_first_frame)\n    down_block_type = down_block_type[7:] if down_block_type.startswith(\"UNetRes\") else down_block_type\n    if down_block_type == \"DownBlock3D\":\n        return DownBlock3D(\n            num_layers=num_layers,\n            in_channels=in_channels,\n            out_channels=out_channels,\n            temb_channels=temb_channels,\n            add_downsample=add_downsample,\n            resnet_eps=resnet_eps,\n            resnet_act_fn=resnet_act_fn,\n            resnet_groups=resnet_groups,\n            downsample_padding=downsample_padding,\n            resnet_time_scale_shift=resnet_time_scale_shift,\n        )\n    elif down_block_type == \"CrossAttnDownBlock3D\":\n        if cross_attention_dim is None:\n            raise ValueError(\"cross_attention_dim must be specified for CrossAttnDownBlock3D\")\n        return CrossAttnDownBlock3D(\n            num_layers=num_layers,\n            
in_channels=in_channels,\n            out_channels=out_channels,\n            temb_channels=temb_channels,\n            add_downsample=add_downsample,\n            resnet_eps=resnet_eps,\n            resnet_act_fn=resnet_act_fn,\n            resnet_groups=resnet_groups,\n            downsample_padding=downsample_padding,\n            cross_attention_dim=cross_attention_dim,\n            attn_num_head_channels=attn_num_head_channels,\n            dual_cross_attention=dual_cross_attention,\n            use_linear_projection=use_linear_projection,\n            only_cross_attention=only_cross_attention,\n            upcast_attention=upcast_attention,\n            resnet_time_scale_shift=resnet_time_scale_shift,\n            use_first_frame=use_first_frame,\n            use_relative_position=use_relative_position,\n            rotary_emb=rotary_emb,\n        )\n    raise ValueError(f\"{down_block_type} does not exist.\")\n\n\ndef get_up_block(\n    up_block_type,\n    num_layers,\n    in_channels,\n    out_channels,\n    prev_output_channel,\n    temb_channels,\n    add_upsample,\n    resnet_eps,\n    resnet_act_fn,\n    attn_num_head_channels,\n    resnet_groups=None,\n    cross_attention_dim=None,\n    dual_cross_attention=False,\n    use_linear_projection=False,\n    only_cross_attention=False,\n    upcast_attention=False,\n    resnet_time_scale_shift=\"default\",\n    use_first_frame=False,\n    use_relative_position=False,\n    rotary_emb=False,\n):\n    up_block_type = up_block_type[7:] if up_block_type.startswith(\"UNetRes\") else up_block_type\n    if up_block_type == \"UpBlock3D\":\n        return UpBlock3D(\n            num_layers=num_layers,\n            in_channels=in_channels,\n            out_channels=out_channels,\n            prev_output_channel=prev_output_channel,\n            temb_channels=temb_channels,\n            add_upsample=add_upsample,\n            resnet_eps=resnet_eps,\n            resnet_act_fn=resnet_act_fn,\n            
resnet_groups=resnet_groups,\n            resnet_time_scale_shift=resnet_time_scale_shift,\n        )\n    elif up_block_type == \"CrossAttnUpBlock3D\":\n        if cross_attention_dim is None:\n            raise ValueError(\"cross_attention_dim must be specified for CrossAttnUpBlock3D\")\n        return CrossAttnUpBlock3D(\n            num_layers=num_layers,\n            in_channels=in_channels,\n            out_channels=out_channels,\n            prev_output_channel=prev_output_channel,\n            temb_channels=temb_channels,\n            add_upsample=add_upsample,\n            resnet_eps=resnet_eps,\n            resnet_act_fn=resnet_act_fn,\n            resnet_groups=resnet_groups,\n            cross_attention_dim=cross_attention_dim,\n            attn_num_head_channels=attn_num_head_channels,\n            dual_cross_attention=dual_cross_attention,\n            use_linear_projection=use_linear_projection,\n            only_cross_attention=only_cross_attention,\n            upcast_attention=upcast_attention,\n            resnet_time_scale_shift=resnet_time_scale_shift,\n            use_first_frame=use_first_frame,\n            use_relative_position=use_relative_position,\n            rotary_emb=rotary_emb,\n        )\n    raise ValueError(f\"{up_block_type} does not exist.\")\n\n\nclass UNetMidBlock3DCrossAttn(nn.Module):\n    def __init__(\n        self,\n        in_channels: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        attn_num_head_channels=1,\n        output_scale_factor=1.0,\n        cross_attention_dim=1280,\n        dual_cross_attention=False,\n        use_linear_projection=False,\n        upcast_attention=False,\n        use_first_frame=False,\n        use_relative_position=False,\n        
rotary_emb=False,\n    ):\n        super().__init__()\n\n        self.has_cross_attention = True\n        self.attn_num_head_channels = attn_num_head_channels\n        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)\n\n        # there is always at least one resnet\n        resnets = [\n            ResnetBlock3D(\n                in_channels=in_channels,\n                out_channels=in_channels,\n                temb_channels=temb_channels,\n                eps=resnet_eps,\n                groups=resnet_groups,\n                dropout=dropout,\n                time_embedding_norm=resnet_time_scale_shift,\n                non_linearity=resnet_act_fn,\n                output_scale_factor=output_scale_factor,\n                pre_norm=resnet_pre_norm,\n            )\n        ]\n        attentions = []\n\n        for _ in range(num_layers):\n            if dual_cross_attention:\n                raise NotImplementedError\n            attentions.append(\n                Transformer3DModel(\n                    attn_num_head_channels,\n                    in_channels // attn_num_head_channels,\n                    in_channels=in_channels,\n                    num_layers=1,\n                    cross_attention_dim=cross_attention_dim,\n                    norm_num_groups=resnet_groups,\n                    use_linear_projection=use_linear_projection,\n                    upcast_attention=upcast_attention,\n                    use_first_frame=use_first_frame,\n                    use_relative_position=use_relative_position,\n                    rotary_emb=rotary_emb,\n                )\n            )\n            resnets.append(\n                ResnetBlock3D(\n                    in_channels=in_channels,\n                    out_channels=in_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n               
     time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    output_scale_factor=output_scale_factor,\n                    pre_norm=resnet_pre_norm,\n                )\n            )\n\n        self.attentions = nn.ModuleList(attentions)\n        self.resnets = nn.ModuleList(resnets)\n\n    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, use_image_num=None, ip_hidden_states=None, encoder_temporal_hidden_states=None):\n        hidden_states = self.resnets[0](hidden_states, temb)\n        for attn, resnet in zip(self.attentions, self.resnets[1:]):\n            hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states, use_image_num=use_image_num, ip_hidden_states=ip_hidden_states, encoder_temporal_hidden_states=encoder_temporal_hidden_states).sample\n            hidden_states = resnet(hidden_states, temb)\n\n        return hidden_states\n\n\nclass CrossAttnDownBlock3D(nn.Module):\n    def __init__(\n        self,\n        in_channels: int,\n        out_channels: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        attn_num_head_channels=1,\n        cross_attention_dim=1280,\n        output_scale_factor=1.0,\n        downsample_padding=1,\n        add_downsample=True,\n        dual_cross_attention=False,\n        use_linear_projection=False,\n        only_cross_attention=False,\n        upcast_attention=False,\n        use_first_frame=False,\n        use_relative_position=False,\n        rotary_emb=False,\n    ):\n        super().__init__()\n        resnets = []\n        attentions = []\n\n        # print(use_first_frame)\n\n        self.has_cross_attention = True\n        
self.attn_num_head_channels = attn_num_head_channels\n\n        for i in range(num_layers):\n            in_channels = in_channels if i == 0 else out_channels\n            resnets.append(\n                ResnetBlock3D(\n                    in_channels=in_channels,\n                    out_channels=out_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n                    time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    output_scale_factor=output_scale_factor,\n                    pre_norm=resnet_pre_norm,\n                )\n            )\n            if dual_cross_attention:\n                raise NotImplementedError\n            attentions.append(\n                Transformer3DModel(\n                    attn_num_head_channels,\n                    out_channels // attn_num_head_channels,\n                    in_channels=out_channels,\n                    num_layers=1,\n                    cross_attention_dim=cross_attention_dim,\n                    norm_num_groups=resnet_groups,\n                    use_linear_projection=use_linear_projection,\n                    only_cross_attention=only_cross_attention,\n                    upcast_attention=upcast_attention,\n                    use_first_frame=use_first_frame,\n                    use_relative_position=use_relative_position,\n                    rotary_emb=rotary_emb,\n                )\n            )\n        self.attentions = nn.ModuleList(attentions)\n        self.resnets = nn.ModuleList(resnets)\n\n        if add_downsample:\n            self.downsamplers = nn.ModuleList(\n                [\n                    Downsample3D(\n                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name=\"op\"\n                    )\n                ]\n            )\n        
else:\n            self.downsamplers = None\n\n        self.gradient_checkpointing = False\n\n    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, use_image_num=None, ip_hidden_states=None, encoder_temporal_hidden_states=None):\n        output_states = ()\n\n        for resnet, attn in zip(self.resnets, self.attentions):\n            if self.training and self.gradient_checkpointing:\n\n                def create_custom_forward(module, return_dict=None):\n                    def custom_forward(*inputs):\n                        if return_dict is not None:\n                            return module(*inputs, return_dict=return_dict)\n                        else:\n                            return module(*inputs)\n\n                    return custom_forward\n                \n                def create_custom_forward_attn(module, return_dict=None, use_image_num=None, ip_hidden_states=None):\n                    def custom_forward(*inputs):\n                        if return_dict is not None:\n                            return module(*inputs, return_dict=return_dict, use_image_num=use_image_num, ip_hidden_states=ip_hidden_states)\n                        else:\n                            return module(*inputs, use_image_num=use_image_num, ip_hidden_states=ip_hidden_states)\n\n                    return custom_forward\n\n                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)\n                hidden_states = torch.utils.checkpoint.checkpoint(\n                    create_custom_forward_attn(attn, return_dict=False, use_image_num=use_image_num, ip_hidden_states=ip_hidden_states),\n                    hidden_states,\n                    encoder_hidden_states,\n                )[0]\n            else:\n                hidden_states = resnet(hidden_states, temb)\n                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states, 
use_image_num=use_image_num, ip_hidden_states=ip_hidden_states, encoder_temporal_hidden_states=encoder_temporal_hidden_states).sample\n\n            output_states += (hidden_states,)\n\n        if self.downsamplers is not None:\n            for downsampler in self.downsamplers:\n                hidden_states = downsampler(hidden_states)\n\n            output_states += (hidden_states,)\n\n        return hidden_states, output_states\n\n\nclass DownBlock3D(nn.Module):\n    def __init__(\n        self,\n        in_channels: int,\n        out_channels: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        output_scale_factor=1.0,\n        add_downsample=True,\n        downsample_padding=1,\n    ):\n        super().__init__()\n        resnets = []\n\n        for i in range(num_layers):\n            in_channels = in_channels if i == 0 else out_channels\n            resnets.append(\n                ResnetBlock3D(\n                    in_channels=in_channels,\n                    out_channels=out_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n                    time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    output_scale_factor=output_scale_factor,\n                    pre_norm=resnet_pre_norm,\n                )\n            )\n\n        self.resnets = nn.ModuleList(resnets)\n\n        if add_downsample:\n            self.downsamplers = nn.ModuleList(\n                [\n                    Downsample3D(\n                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name=\"op\"\n     
               )\n                ]\n            )\n        else:\n            self.downsamplers = None\n\n        self.gradient_checkpointing = False\n\n    def forward(self, hidden_states, temb=None):\n        output_states = ()\n\n        for resnet in self.resnets:\n            if self.training and self.gradient_checkpointing:\n\n                def create_custom_forward(module):\n                    def custom_forward(*inputs):\n                        return module(*inputs)\n\n                    return custom_forward\n\n                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)\n            else:\n                hidden_states = resnet(hidden_states, temb)\n\n            output_states += (hidden_states,)\n\n        if self.downsamplers is not None:\n            for downsampler in self.downsamplers:\n                hidden_states = downsampler(hidden_states)\n\n            output_states += (hidden_states,)\n\n        return hidden_states, output_states\n\n\nclass CrossAttnUpBlock3D(nn.Module):\n    def __init__(\n        self,\n        in_channels: int,\n        out_channels: int,\n        prev_output_channel: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        attn_num_head_channels=1,\n        cross_attention_dim=1280,\n        output_scale_factor=1.0,\n        add_upsample=True,\n        dual_cross_attention=False,\n        use_linear_projection=False,\n        only_cross_attention=False,\n        upcast_attention=False,\n        use_first_frame=False,\n        use_relative_position=False,\n        rotary_emb=False\n    ):\n        super().__init__()\n        resnets = []\n        attentions = []\n\n        self.has_cross_attention = True\n        
self.attn_num_head_channels = attn_num_head_channels\n\n        for i in range(num_layers):\n            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels\n            resnet_in_channels = prev_output_channel if i == 0 else out_channels\n\n            resnets.append(\n                ResnetBlock3D(\n                    in_channels=resnet_in_channels + res_skip_channels,\n                    out_channels=out_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n                    time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    output_scale_factor=output_scale_factor,\n                    pre_norm=resnet_pre_norm,\n                )\n            )\n            if dual_cross_attention:\n                raise NotImplementedError\n            attentions.append(\n                Transformer3DModel(\n                    attn_num_head_channels,\n                    out_channels // attn_num_head_channels,\n                    in_channels=out_channels,\n                    num_layers=1,\n                    cross_attention_dim=cross_attention_dim,\n                    norm_num_groups=resnet_groups,\n                    use_linear_projection=use_linear_projection,\n                    only_cross_attention=only_cross_attention,\n                    upcast_attention=upcast_attention,\n                    use_first_frame=use_first_frame,\n                    use_relative_position=use_relative_position,\n                    rotary_emb=rotary_emb,\n                )\n            )\n\n        self.attentions = nn.ModuleList(attentions)\n        self.resnets = nn.ModuleList(resnets)\n\n        if add_upsample:\n            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])\n        else:\n            self.upsamplers = 
None\n\n        self.gradient_checkpointing = False\n\n    def forward(\n        self,\n        hidden_states,\n        res_hidden_states_tuple,\n        temb=None,\n        encoder_hidden_states=None,\n        upsample_size=None,\n        attention_mask=None,\n        use_image_num=None,\n        ip_hidden_states=None,\n        encoder_temporal_hidden_states=None\n    ):\n        for resnet, attn in zip(self.resnets, self.attentions):\n            # pop res hidden states\n            res_hidden_states = res_hidden_states_tuple[-1]\n            res_hidden_states_tuple = res_hidden_states_tuple[:-1]\n            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)\n\n            if self.training and self.gradient_checkpointing:\n\n                def create_custom_forward(module, return_dict=None):\n                    def custom_forward(*inputs):\n                        if return_dict is not None:\n                            return module(*inputs, return_dict=return_dict)\n                        else:\n                            return module(*inputs)\n\n                    return custom_forward\n                \n                def create_custom_forward_attn(module, return_dict=None, use_image_num=None, ip_hidden_states=None):\n                    def custom_forward(*inputs):\n                        if return_dict is not None:\n                            return module(*inputs, return_dict=return_dict, use_image_num=use_image_num, ip_hidden_states=ip_hidden_states)\n                        else:\n                            return module(*inputs, use_image_num=use_image_num, ip_hidden_states=ip_hidden_states)\n\n                    return custom_forward\n\n                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)\n                hidden_states = torch.utils.checkpoint.checkpoint(\n                    create_custom_forward_attn(attn, return_dict=False, use_image_num=use_image_num, 
ip_hidden_states=ip_hidden_states),\n                    hidden_states,\n                    encoder_hidden_states,\n                )[0]\n            else:\n                hidden_states = resnet(hidden_states, temb)\n                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states, use_image_num=use_image_num, ip_hidden_states=ip_hidden_states, encoder_temporal_hidden_states=encoder_temporal_hidden_states).sample\n\n        if self.upsamplers is not None:\n            for upsampler in self.upsamplers:\n                hidden_states = upsampler(hidden_states, upsample_size)\n\n        return hidden_states\n\n\nclass UpBlock3D(nn.Module):\n    def __init__(\n        self,\n        in_channels: int,\n        prev_output_channel: int,\n        out_channels: int,\n        temb_channels: int,\n        dropout: float = 0.0,\n        num_layers: int = 1,\n        resnet_eps: float = 1e-6,\n        resnet_time_scale_shift: str = \"default\",\n        resnet_act_fn: str = \"swish\",\n        resnet_groups: int = 32,\n        resnet_pre_norm: bool = True,\n        output_scale_factor=1.0,\n        add_upsample=True,\n    ):\n        super().__init__()\n        resnets = []\n\n        for i in range(num_layers):\n            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels\n            resnet_in_channels = prev_output_channel if i == 0 else out_channels\n\n            resnets.append(\n                ResnetBlock3D(\n                    in_channels=resnet_in_channels + res_skip_channels,\n                    out_channels=out_channels,\n                    temb_channels=temb_channels,\n                    eps=resnet_eps,\n                    groups=resnet_groups,\n                    dropout=dropout,\n                    time_embedding_norm=resnet_time_scale_shift,\n                    non_linearity=resnet_act_fn,\n                    output_scale_factor=output_scale_factor,\n                    
pre_norm=resnet_pre_norm,\n                )\n            )\n\n        self.resnets = nn.ModuleList(resnets)\n\n        if add_upsample:\n            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])\n        else:\n            self.upsamplers = None\n\n        self.gradient_checkpointing = False\n\n    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):\n        for resnet in self.resnets:\n            # pop res hidden states\n            res_hidden_states = res_hidden_states_tuple[-1]\n            res_hidden_states_tuple = res_hidden_states_tuple[:-1]\n            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)\n\n            if self.training and self.gradient_checkpointing:\n\n                def create_custom_forward(module):\n                    def custom_forward(*inputs):\n                        return module(*inputs)\n\n                    return custom_forward\n\n                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)\n            else:\n                hidden_states = resnet(hidden_states, temb)\n\n        if self.upsamplers is not None:\n            for upsampler in self.upsamplers:\n                hidden_states = upsampler(hidden_states, upsample_size)\n\n        return hidden_states\n"
  },
  {
    "path": "models/utils.py",
    "content": "# adopted from\r\n# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py\r\n# and\r\n# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py\r\n# and\r\n# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py\r\n#\r\n# thanks!\r\n\r\n\r\nimport os\r\nimport math\r\nimport torch\r\n\r\nimport numpy as np\r\nimport torch.nn as nn\r\n\r\nfrom einops import repeat\r\n\r\n\r\n#################################################################################\r\n#                                  Unet Utils                                   #\r\n#################################################################################\r\n\r\ndef checkpoint(func, inputs, params, flag):\r\n    \"\"\"\r\n    Evaluate a function without caching intermediate activations, allowing for\r\n    reduced memory at the expense of extra compute in the backward pass.\r\n    :param func: the function to evaluate.\r\n    :param inputs: the argument sequence to pass to `func`.\r\n    :param params: a sequence of parameters `func` depends on but does not\r\n                   explicitly take as arguments.\r\n    :param flag: if False, disable gradient checkpointing.\r\n    \"\"\"\r\n    if flag:\r\n        args = tuple(inputs) + tuple(params)\r\n        return CheckpointFunction.apply(func, len(inputs), *args)\r\n    else:\r\n        return func(*inputs)\r\n\r\n\r\nclass CheckpointFunction(torch.autograd.Function):\r\n    @staticmethod\r\n    def forward(ctx, run_function, length, *args):\r\n        ctx.run_function = run_function\r\n        ctx.input_tensors = list(args[:length])\r\n        ctx.input_params = list(args[length:])\r\n\r\n        with torch.no_grad():\r\n            output_tensors = ctx.run_function(*ctx.input_tensors)\r\n        return output_tensors\r\n\r\n    
@staticmethod\r\n    def backward(ctx, *output_grads):\r\n        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]\r\n        with torch.enable_grad():\r\n            # Fixes a bug where the first op in run_function modifies the\r\n            # Tensor storage in place, which is not allowed for detach()'d\r\n            # Tensors.\r\n            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]\r\n            output_tensors = ctx.run_function(*shallow_copies)\r\n        input_grads = torch.autograd.grad(\r\n            output_tensors,\r\n            ctx.input_tensors + ctx.input_params,\r\n            output_grads,\r\n            allow_unused=True,\r\n        )\r\n        del ctx.input_tensors\r\n        del ctx.input_params\r\n        del output_tensors\r\n        return (None, None) + input_grads\r\n\r\n\r\ndef timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):\r\n    \"\"\"\r\n    Create sinusoidal timestep embeddings.\r\n    :param timesteps: a 1-D Tensor of N indices, one per batch element.\r\n                      These may be fractional.\r\n    :param dim: the dimension of the output.\r\n    :param max_period: controls the minimum frequency of the embeddings.\r\n    :return: an [N x dim] Tensor of positional embeddings.\r\n    \"\"\"\r\n    if not repeat_only:\r\n        half = dim // 2\r\n        freqs = torch.exp(\r\n            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half\r\n        ).to(device=timesteps.device)\r\n        args = timesteps[:, None].float() * freqs[None]\r\n        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)\r\n        if dim % 2:\r\n            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)\r\n    else:\r\n        embedding = repeat(timesteps, 'b -> b d', d=dim).contiguous()\r\n    return embedding\r\n\r\n\r\ndef zero_module(module):\r\n    \"\"\"\r\n    Zero out the parameters of a 
module and return it.\r\n    \"\"\"\r\n    for p in module.parameters():\r\n        p.detach().zero_()\r\n    return module\r\n\r\n\r\ndef scale_module(module, scale):\r\n    \"\"\"\r\n    Scale the parameters of a module and return it.\r\n    \"\"\"\r\n    for p in module.parameters():\r\n        p.detach().mul_(scale)\r\n    return module\r\n\r\n\r\ndef mean_flat(tensor):\r\n    \"\"\"\r\n    Take the mean over all non-batch dimensions.\r\n    \"\"\"\r\n    return tensor.mean(dim=list(range(1, len(tensor.shape))))\r\n\r\n\r\ndef normalization(channels):\r\n    \"\"\"\r\n    Make a standard normalization layer.\r\n    :param channels: number of input channels.\r\n    :return: an nn.Module for normalization.\r\n    \"\"\"\r\n    return GroupNorm32(32, channels)\r\n\r\n\r\n# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.\r\nclass SiLU(nn.Module):\r\n    def forward(self, x):\r\n        return x * torch.sigmoid(x)\r\n\r\n\r\nclass GroupNorm32(nn.GroupNorm):\r\n    def forward(self, x):\r\n        return super().forward(x.float()).type(x.dtype)\r\n\r\ndef conv_nd(dims, *args, **kwargs):\r\n    \"\"\"\r\n    Create a 1D, 2D, or 3D convolution module.\r\n    \"\"\"\r\n    if dims == 1:\r\n        return nn.Conv1d(*args, **kwargs)\r\n    elif dims == 2:\r\n        return nn.Conv2d(*args, **kwargs)\r\n    elif dims == 3:\r\n        return nn.Conv3d(*args, **kwargs)\r\n    raise ValueError(f\"unsupported dimensions: {dims}\")\r\n\r\n\r\ndef linear(*args, **kwargs):\r\n    \"\"\"\r\n    Create a linear module.\r\n    \"\"\"\r\n    return nn.Linear(*args, **kwargs)\r\n\r\n\r\ndef avg_pool_nd(dims, *args, **kwargs):\r\n    \"\"\"\r\n    Create a 1D, 2D, or 3D average pooling module.\r\n    \"\"\"\r\n    if dims == 1:\r\n        return nn.AvgPool1d(*args, **kwargs)\r\n    elif dims == 2:\r\n        return nn.AvgPool2d(*args, **kwargs)\r\n    elif dims == 3:\r\n        return nn.AvgPool3d(*args, **kwargs)\r\n    raise ValueError(f\"unsupported dimensions: 
{dims}\")\r\n\r\n\r\n# class HybridConditioner(nn.Module):\r\n\r\n#     def __init__(self, c_concat_config, c_crossattn_config):\r\n#         super().__init__()\r\n#         self.concat_conditioner = instantiate_from_config(c_concat_config)\r\n#         self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)\r\n\r\n#     def forward(self, c_concat, c_crossattn):\r\n#         c_concat = self.concat_conditioner(c_concat)\r\n#         c_crossattn = self.crossattn_conditioner(c_crossattn)\r\n#         return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}\r\n\r\n\r\ndef noise_like(shape, device, repeat=False):\r\n    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))\r\n    noise = lambda: torch.randn(shape, device=device)\r\n    return repeat_noise() if repeat else noise()\r\n\r\ndef count_flops_attn(model, _x, y):\r\n    \"\"\"\r\n    A counter for the `thop` package to count the operations in an\r\n    attention operation.\r\n    Meant to be used like:\r\n        macs, params = thop.profile(\r\n            model,\r\n            inputs=(inputs, timestamps),\r\n            custom_ops={QKVAttention: QKVAttention.count_flops},\r\n        )\r\n    \"\"\"\r\n    b, c, *spatial = y[0].shape\r\n    num_spatial = int(np.prod(spatial))\r\n    # We perform two matmuls with the same number of ops.\r\n    # The first computes the weight matrix, the second computes\r\n    # the combination of the value vectors.\r\n    matmul_ops = 2 * b * (num_spatial ** 2) * c\r\n    model.total_ops += torch.DoubleTensor([matmul_ops])\r\n\r\ndef count_params(model, verbose=False):\r\n    total_params = sum(p.numel() for p in model.parameters())\r\n    if verbose:\r\n        print(f\"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.\")\r\n    return total_params"
  },
  {
    "path": "requirements.txt",
    "content": "bark_ssg==1.3.4\ndecord==0.6.0\ndiffusers==0.25.0\neinops==0.7.0\nimageio==2.28.0\nipython==8.14.0\nlibrosa==0.10.1\nmmcv==2.1.0\nmoviepy==1.0.3\nnatsort==8.3.1\nnltk==3.8.1\nnumpy==1.23.5\nomegaconf==2.3.0\nopenai==0.27.8\nopencv_python==4.7.0.72\nPillow==9.4.0\npytorch_lightning==2.0.2\nrotary_embedding_torch==0.2.3\nsoundfile==0.12.1\ntorch==2.0.0\ntorchvision==0.15.0\ntqdm==4.65.0\ntransformers==4.28.1\nxformers==0.0.19\n"
  },
  {
    "path": "results/vlog/teddy_travel/script/audio_prompts.txt",
    "content": "[\n {\n  \"video fragment id\": 1,\n  \"video fragment description\": \"Teddy is planning on paper.\"\n },\n {\n  \"video fragment id\": 2, \n  \"video fragment description\": \"He will travel around the world and explore every corner.\"\n }, \n {\n  \"video fragment id\": 3, \n  \"video fragment description\": \"Teddy came to the airport lobby, surrounded by people coming and going.\"\n }, \n {\n  \"video fragment id\": 4, \n  \"video fragment description\": \"Teddy's plane took off slowly from the airport and headed to his first destination..\"\n }, \n {\n  \"video fragment id\": 5, \n  \"video fragment description\": \"Teddy arrives in Paris.\"\n },\n {\n  \"video fragment id\": 6, \n  \"video fragment description\": \"Teddy came to the Eiffel Tower because of its reputation.\"\n },\n {\n  \"video fragment id\": 7, \n  \"video fragment description\": \"Teddy took out the picnic cloth and prepared food on the lawn and started enjoying lunch.\"\n },\n {\n  \"video fragment id\": 8, \n  \"video fragment description\": \"The scene turned and we came to the Great Wall of China.\"\n },\n {\n  \"video fragment id\": 9, \n  \"video fragment description\": \"Teddy was very excited and worked hard to climb the Great Wall.\"\n }, \n {\n  \"video fragment id\": 10, \n  \"video fragment description\": \"Finally reaching the top, Teddy looked at the scenery in the distance.\"\n },\n {\n  \"video fragment id\": 11, \n  \"video fragment description\": \"It's getting late and Teddy is leaving too.\"\n },\n {\n  \"video fragment id\": 12, \n  \"video fragment description\": \"Teddy is sitting on the bus, preparing to rush to the next destination.\"\n },\n {\n  \"video fragment id\": 13, \n  \"video fragment description\": \"The plane flies across the sky again.\"\n },\n {\n  \"video fragment id\": 14, \n  \"video fragment description\": \"This time Teddy came to the desert of Egypt.\"\n },\n {\n  \"video fragment id\": 15, \n  \"video fragment description\": 
\"Teddy was running wildly in the desert, and the pyramid suddenly appeared in front of him.\"\n },\n {\n  \"video fragment id\": 16, \n  \"video fragment description\": \"He hesitated for a moment before entering the pyramid.\"\n },\n {\n  \"video fragment id\": 17, \n  \"video fragment description\": \"When he enter the pyramid, he have to walk through a dark corridor.\"\n },\n {\n  \"video fragment id\": 18, \n  \"video fragment description\": \"While walking, he suddenly came to a bright room.\"\n },\n {\n  \"video fragment id\": 19, \n  \"video fragment description\": \"It turns out there is a hidden treasure waiting to be discovered by the destined person.\"\n },\n {\n  \"video fragment id\": 20, \n  \"video fragment description\": \"Teddy found his master with the treasure, and his master smiled broadly\"\n },\n {\n  \"video fragment id\": 21, \n  \"video fragment description\": \"The house is filled with things Teddy brought back.\"\n },\n {\n  \"video fragment id\": 22, \n  \"video fragment description\": \"Teddy is surrounded by things and still immersed in the joy of traveling.\"\n },\n {\n  \"video fragment id\": 23, \n  \"video fragment description\": \"But everything has an end, and this journey will eventually come to an end..\"\n },\n {\n  \"video fragment id\": 24, \n  \"video fragment description\": \"Teddy is sitting on the chair. Where will he go next time?\"\n }\n]"
  },
  {
    "path": "results/vlog/teddy_travel/script/protagonists_places.txt",
    "content": "[\n    {\n        \"id\": 1,\n        \"name\": \"Teddy\",\n        \"description\": \"A teddy bear with a dream of traveling the world\"\n    },\n    {\n        \"id\": 2,\n        \"name\": \"Eiffel Tower\",\n        \"description\": \"An iconic wrought-iron lattice tower located in Paris, France\"\n    },\n    {\n        \"id\": 3,\n        \"name\": \"Great Wall\",\n        \"description\": \"A vast, historic fortification system that stretches across the northern part of China\"\n    },\n    {\n        \"id\": 4,\n        \"name\": \"Pyramids\",\n        \"description\": \"Ancient monumental structures located in Egypt\"\n    }\n]"
  },
  {
    "path": "results/vlog/teddy_travel/script/time_scripts.txt",
    "content": "[\n{\n    \"video fragment id\": 1,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 2,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 3,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 4,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 5,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 6,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 7,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 8,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 9,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 10,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 11,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 12,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 13,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 14,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 15,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 16,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 17,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 18,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 19,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 20,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 21,\n    \"time\": 3\n},\n{\n    \"video fragment id\": 22,\n    \"time\": 2\n},\n{\n    \"video fragment id\": 23,\n    \"time\": 3\n}\n]"
  },
  {
    "path": "results/vlog/teddy_travel/script/video_prompts.txt",
    "content": "[\n{\n    \"video fragment id\": 1,\n    \"video fragment description\": \"Teddy bear in a kid's room.\"\n},\n{\n    \"video fragment id\": 2,\n    \"video fragment description\": \"Teddy bear is dreaming.\"\n},\n{\n    \"video fragment id\": 3,\n    \"video fragment description\": \"Dreams of travels.\"\n},\n{\n    \"video fragment id\": 4,\n    \"video fragment description\": \"Teddy in an airport.\"\n},\n{\n    \"video fragment id\": 5,\n    \"video fragment description\": \"Teddy peering from a backpack.\"\n},\n{\n    \"video fragment id\": 6,\n    \"video fragment description\": \"Teddy on a picnic blanket.\"\n},\n{\n    \"video fragment id\": 7,\n    \"video fragment description\": \"Eiffel Tower in the background.\"\n},\n{\n    \"video fragment id\": 8,\n    \"video fragment description\": \"Teddy enjoys Parisian picnic.\"\n},\n{\n    \"video fragment id\": 9,\n    \"video fragment description\": \"Croissants around Teddy.\"\n},\n{\n    \"video fragment id\": 10,\n    \"video fragment description\": \"Teddy atop the Great Wall.\"\n},\n{\n    \"video fragment id\": 11,\n    \"video fragment description\": \"Teddy admiring the view.\"\n},\n{\n    \"video fragment id\": 12,\n    \"video fragment description\": \"Teddy exploring pyramids in Egypt.\"\n},\n{\n    \"video fragment id\": 13,\n    \"video fragment description\": \"Under the hot Egyptian sun.\"\n},\n{\n    \"video fragment id\": 14,\n    \"video fragment description\": \"Teddy finds a treasure chest.\"\n},\n{\n    \"video fragment id\": 15,\n    \"video fragment description\": \"Treasure chest inside a pyramid.\"\n},\n{\n    \"video fragment id\": 16,\n    \"video fragment description\": \"Teddy back in the bedroom.\"\n},\n{\n    \"video fragment id\": 17,\n    \"video fragment description\": \"Sharing travel tales.\"\n},\n{\n    \"video fragment id\": 18,\n    \"video fragment description\": \"A little girl reacts.\"\n},\n{\n    \"video fragment id\": 19,\n    \"video fragment 
description\": \"Amazed by Teddy's stories.\"\n},\n{\n    \"video fragment id\": 20,\n    \"video fragment description\": \"Room filled with souvenirs.\"\n},\n{\n    \"video fragment id\": 21,\n    \"video fragment description\": \"Souvenirs from Teddy's trip.\"\n},\n{\n    \"video fragment id\": 22,\n    \"video fragment description\": \"Teddy gazing at a world map.\"\n},\n{\n    \"video fragment id\": 23,\n    \"video fragment description\": \"Dreaming of the next adventure.\"\n}\n]\n"
  },
  {
    "path": "results/vlog/teddy_travel/script/zh_video_prompts.txt",
    "content": "[\n{\n                \"序号\": 1,\n                \"描述\": \"泰迪熊在孩子的房间里。\",\n            },\n            {\n                \"序号\": 2,\n                \"描述\": \"泰迪熊正在做梦。\",\n            },\n            {\n                \"序号\": 3,\n                \"描述\": \"梦想着旅行。\",\n            },\n            {\n                \"序号\": 4,\n                \"描述\": \"泰迪熊在机场。\",\n            },\n            {\n                \"序号\": 5,\n                \"描述\": \"泰迪熊从背包中探出头来。\",\n            },\n            {\n                \"序号\": 6,\n                \"描述\": \"泰迪熊在野餐毯上。\",\n            },\n            {\n                \"序号\": 7,\n                \"描述\": \"背景是埃菲尔铁塔。\",\n            },\n            {\n                \"序号\": 8,\n                \"描述\": \"泰迪熊正在享受巴黎野餐。\",\n            },\n            {\n                \"序号\": 9,\n                \"描述\": \"泰迪熊周围是羊角面包。\",\n            },\n            {\n                \"序号\": 10,\n                \"描述\": \"泰迪熊在长城顶部。\",\n            },\n            {\n                \"序号\": 11,\n                \"描述\": \"泰迪熊正在欣赏风景。\",\n            },\n            {\n                \"序号\": 12,\n                \"描述\": \"泰迪熊在埃及探索金字塔。\",\n            },\n            {\n                \"序号\": 13,\n                \"描述\": \"炎热的埃及阳光下。\",\n            },\n            {\n                \"序号\": 14,\n                \"描述\": \"泰迪熊找到了一个宝箱。\",\n            },\n            {\n                \"序号\": 15,\n                \"描述\": \"宝箱在金字塔内部。\",\n            },\n            {\n                \"序号\": 16,\n                \"描述\": \"泰迪熊回到卧室。\",\n            },\n            {\n                \"序号\": 17,\n                \"描述\": \"分享旅行故事。\",\n            },\n            {\n                \"序号\": 18,\n                \"描述\": \"一个小女孩在反应。\",\n            },\n            {\n                \"序号\": 19,\n                \"描述\": \"惊讶于泰迪熊的故事。\",\n            },\n            {\n                \"序号\": 20,\n                \"描述\": \"房间里满是纪念品。\",\n          
  },\n            {\n                \"序号\": 21,\n                \"描述\": \"来自泰迪熊旅行的纪念品。\",\n            },\n            {\n                \"序号\": 22,\n                \"描述\": \"泰迪熊正在看世界地图。\",\n            },\n            {\n                \"序号\": 23,\n                \"描述\": \"梦想着下一次的冒险。\",\n            }\n            \n]"
  },
  {
    "path": "results/vlog/teddy_travel/story.txt",
    "content": "Once upon a time, there was a teddy bear named Teddy who dreamed of traveling the world. One day, his dream came true to travel around the world. Teddy sat in the airport lobby and traveled to many places of interest. Along the way, Teddy visited the Eiffel Tower, the Great Wall, and the pyramids. In Paris, Teddy had a picnic and enjoyed some delicious croissants. At the Great Wall of China, he climbed to the top and marveled at the breathtaking view. And in Egypt, he explored the pyramids and even found a secret treasure hidden inside. After his exciting journey, Teddy was eventually reunited with his owner who was thrilled to hear about all of Teddy’s adventures. From that day on, Teddy always dreamed of traveling the world again and experiencing new and exciting things."
  },
  {
    "path": "results/vlog/teddy_travel_/story.txt",
    "content": "Once upon a time, there was a teddy bear named Teddy who dreamed of traveling the world. One day, his dream came true to travel around the world. Teddy sat in the airport lobby and traveled to many places of interest. Along the way, Teddy visited the Eiffel Tower, the Great Wall, and the pyramids. In Paris, Teddy had a picnic and enjoyed some delicious croissants. At the Great Wall of China, he climbed to the top and marveled at the breathtaking view. And in Egypt, he explored the pyramids and even found a secret treasure hidden inside. After his exciting journey, Teddy was eventually reunited with his owner who was thrilled to hear about all of Teddy’s adventures. From that day on, Teddy always dreamed of traveling the world again and experiencing new and exciting things."
  },
  {
    "path": "sample_scripts/vlog_read_script_sample.py",
    "content": "import torch\n\ntorch.backends.cuda.matmul.allow_tf32 = True\ntorch.backends.cudnn.allow_tf32 = True\nimport os\nimport sys\ntry:\n    import utils\n    from diffusion import create_diffusion\nexcept:\n    sys.path.append(os.path.split(sys.path[0])[0])\n    import utils\n    from diffusion import create_diffusion\nimport argparse\nimport torchvision\nfrom PIL import Image\nfrom einops import rearrange\nfrom models import get_models\nfrom diffusers.models import AutoencoderKL\nfrom models.clip import TextEmbedder\nfrom omegaconf import OmegaConf\nfrom pytorch_lightning import seed_everything\nfrom utils import mask_generation_before\nfrom diffusers.utils.import_utils import is_xformers_available\nfrom transformers import CLIPVisionModelWithProjection, CLIPImageProcessor\nfrom vlogger.videofusion import fusion\nfrom vlogger.videocaption import captioning\nfrom vlogger.videoaudio import make_audio, merge_video_audio, concatenate_videos\nfrom vlogger.STEB.model_transform import ip_scale_set, ip_transform_model, tca_transform_model\nfrom vlogger.planning_utils.gpt4_utils import (readscript, \n                                               readtimescript, \n                                               readprotagonistscript, \n                                               readreferencescript, \n                                               readzhscript)\n\n\ndef auto_inpainting(args, \n                    video_input, \n                    masked_video, \n                    mask, \n                    prompt, \n                    image, \n                    vae, \n                    text_encoder, \n                    image_encoder, \n                    diffusion, \n                    model, \n                    device,\n                    ):\n    image_prompt_embeds = None\n    if prompt is None:\n        prompt = \"\"\n    if image is not None:\n        clip_image = CLIPImageProcessor()(images=image, return_tensors=\"pt\").pixel_values\n      
  clip_image_embeds = image_encoder(clip_image.to(device)).image_embeds\n        uncond_clip_image_embeds = torch.zeros_like(clip_image_embeds).to(device)\n        image_prompt_embeds = torch.cat([clip_image_embeds, uncond_clip_image_embeds], dim=0)\n        image_prompt_embeds = rearrange(image_prompt_embeds, '(b n) c -> b n c', b=2).contiguous()\n        model = ip_scale_set(model, args.ref_cfg_scale)\n        if args.use_fp16:\n            image_prompt_embeds = image_prompt_embeds.to(dtype=torch.float16)\n    b, f, c, h, w = video_input.shape\n    latent_h = video_input.shape[-2] // 8\n    latent_w = video_input.shape[-1] // 8\n\n    if args.use_fp16:\n        z = torch.randn(1, 4, 16, latent_h, latent_w, dtype=torch.float16, device=device) # b,c,f,h,w\n        masked_video = masked_video.to(dtype=torch.float16)\n        mask = mask.to(dtype=torch.float16)\n    else:\n        z = torch.randn(1, 4, 16, latent_h, latent_w, device=device) # b,c,f,h,w\n\n    masked_video = rearrange(masked_video, 'b f c h w -> (b f) c h w').contiguous()\n    masked_video = vae.encode(masked_video).latent_dist.sample().mul_(0.18215)\n    masked_video = rearrange(masked_video, '(b f) c h w -> b c f h w', b=b).contiguous()\n    mask = torch.nn.functional.interpolate(mask[:,:,0,:], size=(latent_h, latent_w)).unsqueeze(1)\n    masked_video = torch.cat([masked_video] * 2)\n    mask = torch.cat([mask] * 2)\n    z = torch.cat([z] * 2)\n    prompt_all = [prompt] + [args.negative_prompt]\n\n    text_prompt = text_encoder(text_prompts=prompt_all, train=False)\n    model_kwargs = dict(encoder_hidden_states=text_prompt, \n                        class_labels=None, \n                        cfg_scale=args.cfg_scale,\n                        use_fp16=args.use_fp16,\n                        ip_hidden_states=image_prompt_embeds)\n    \n    # Sample images:\n    samples = diffusion.ddim_sample_loop(model.forward_with_cfg, \n                                         z.shape, \n                          
               z, \n                                         clip_denoised=False, \n                                         model_kwargs=model_kwargs, \n                                         progress=True, \n                                         device=device,\n                                         mask=mask, \n                                         x_start=masked_video, \n                                         use_concat=True,\n                                         )\n    samples, _ = samples.chunk(2, dim=0) # [1, 4, 16, 32, 32]\n    if args.use_fp16:\n        samples = samples.to(dtype=torch.float16)\n\n    video_clip = samples[0].permute(1, 0, 2, 3).contiguous() # [16, 4, 32, 32]\n    video_clip = vae.decode(video_clip / 0.18215).sample # [16, 3, 256, 256]\n    return video_clip\n\n\ndef main(args):\n    # Setup PyTorch:\n    if args.seed:\n        torch.manual_seed(args.seed)\n    torch.set_grad_enabled(False)\n    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n    seed_everything(args.seed)\n\n    model = get_models(args).to(device)\n    model = tca_transform_model(model).to(device)\n    model = ip_transform_model(model).to(device)\n    if args.enable_xformers_memory_efficient_attention:\n        if is_xformers_available():\n            model.enable_xformers_memory_efficient_attention()\n        else:\n            raise ValueError(\"xformers is not available. 
Make sure it is installed correctly\")\n    if args.use_compile:\n        model = torch.compile(model)\n\n    ckpt_path = args.ckpt \n    state_dict = torch.load(ckpt_path, map_location=lambda storage, loc: storage)['ema']\n    model_dict = model.state_dict()\n    pretrained_dict = {}\n    for k, v in state_dict.items():\n        if k in model_dict:\n            pretrained_dict[k] = v\n    model_dict.update(pretrained_dict)\n    model.load_state_dict(model_dict)\n\n    model.eval()  # important!\n    diffusion = create_diffusion(str(args.num_sampling_steps))\n    vae = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder=\"vae\").to(device)\n    text_encoder = text_encoder = TextEmbedder(args.pretrained_model_path).to(device)\n    image_encoder = CLIPVisionModelWithProjection.from_pretrained(args.image_encoder_path).to(device)\n    if args.use_fp16:\n        print('Warnning: using half percision for inferencing!')\n        vae.to(dtype=torch.float16)\n        model.to(dtype=torch.float16)\n        text_encoder.to(dtype=torch.float16)\n    print(\"model ready!\\n\", flush=True)\n    \n    \n    # load protagonist script\n    character_places = readprotagonistscript(args.protagonist_file_path)\n    print(\"protagonists ready!\", flush=True)\n\n    # load script\n    video_list = readscript(args.script_file_path)\n    print(\"video script ready!\", flush=True)\n    \n    # load reference script\n    reference_lists = readreferencescript(video_list, character_places, args.reference_file_path)\n    print(\"reference script ready!\", flush=True)\n    \n    # load zh script\n    zh_video_list = readzhscript(args.zh_script_file_path)\n    print(\"zh script ready!\", flush=True)\n    \n    # load time script\n    key_list = []\n    for key, value in character_places.items():\n        key_list.append(key)\n    time_list = readtimescript(args.time_file_path)\n    print(\"time script ready!\", flush=True)\n    \n\n    # generation begin\n    sample_list = []\n   
 for i, text_prompt in enumerate(video_list):\n        sample_list.append([])\n        for time in range(time_list[i]):\n            if time == 0:\n                print('Generating the ({}) prompt'.format(text_prompt), flush=True)\n                if reference_lists[i][0] == 0 or reference_lists[i][0] > len(key_list):\n                    pil_image = None\n                else:\n                    pil_image = Image.open(args.reference_image_path[reference_lists[i][0] - 1])\n                    pil_image.resize((256, 256))\n                video_input = torch.zeros([1, 16, 3, args.image_size[0], args.image_size[1]]).to(device)\n                mask = mask_generation_before(\"first0\", video_input.shape, video_input.dtype, device) # b,f,c,h,w\n                masked_video = video_input * (mask == 0)\n                samples = auto_inpainting(args, \n                                          video_input, \n                                          masked_video, \n                                          mask, \n                                          text_prompt, \n                                          pil_image, \n                                          vae, \n                                          text_encoder, \n                                          image_encoder, \n                                          diffusion, \n                                          model, \n                                          device,\n                                          )\n                sample_list[i].append(samples)\n            else:\n                if sum(video.shape[0] for video in sample_list[i]) / args.fps >= time_list[i]:\n                    break\n                print('Generating the ({}) prompt'.format(text_prompt), flush=True)\n                if reference_lists[i][0] == 0 or reference_lists[i][0] > len(key_list):\n                    pil_image = None\n                else:\n                    pil_image = 
Image.open(args.reference_image_path[reference_lists[i][0] - 1])\n                    pil_image.resize((256, 256))\n                pre_video = sample_list[i][-1][-args.researve_frame:]\n                f, c, h, w = pre_video.shape\n                lat_video = torch.zeros(args.num_frames - args.researve_frame, c, h, w).to(device)\n                video_input = torch.concat([pre_video, lat_video], dim=0)\n                video_input = video_input.to(device).unsqueeze(0)\n                mask = mask_generation_before(args.mask_type, video_input.shape, video_input.dtype, device)\n                masked_video = video_input * (mask == 0)\n                video_clip = auto_inpainting(args, \n                                             video_input, \n                                             masked_video, \n                                             mask, \n                                             text_prompt, \n                                             pil_image, \n                                             vae, \n                                             text_encoder, \n                                             image_encoder, \n                                             diffusion, \n                                             model, \n                                             device,\n                                             )\n                sample_list[i].append(video_clip[args.researve_frame:])\n                print(video_clip[args.researve_frame:].shape)\n\n        # transition\n        if args.video_transition and i != 0:\n            video_1 = sample_list[i - 1][-1][-1:]\n            video_2 = sample_list[i][0][:1]\n            f, c, h, w = video_1.shape\n            video_middle = torch.zeros(args.num_frames - 2, c, h, w).to(device)\n            video_input = torch.concat([video_1, video_middle, video_2], dim=0)\n            video_input = video_input.to(device).unsqueeze(0)\n            mask = mask_generation_before(\"onelast1\", 
video_input.shape, video_input.dtype, device)\n            masked_video = masked_video = video_input * (mask == 0)\n            video_clip = auto_inpainting(args, \n                                         video_input, \n                                         masked_video, \n                                         mask, \n                                         \"smooth transition, slow motion, slow changing.\", \n                                         pil_image, \n                                         vae, \n                                         text_encoder, \n                                         image_encoder, \n                                         diffusion, \n                                         model, \n                                         device,\n                                         )\n            sample_list[i].insert(0, video_clip[1:-1])\n\n        # save videos\n        samples = torch.concat(sample_list[i], dim=0)\n        samples = samples[0: time_list[i] * args.fps]\n        if not os.path.exists(args.save_origin_video_path):\n            os.makedirs(args.save_origin_video_path)\n        video_ = ((samples * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1).contiguous()\n        torchvision.io.write_video(args.save_origin_video_path + \"/\" + f\"{i}\" + '.mp4', video_, fps=args.fps)\n    \n    # post processing\n    fusion(args.save_origin_video_path)\n    captioning(args.script_file_path, args.zh_script_file_path, args.save_origin_video_path, args.save_caption_video_path)\n    fusion(args.save_caption_video_path)\n    make_audio(args.script_file_path, args.save_audio_path)\n    merge_video_audio(args.save_caption_video_path, args.save_audio_path, args.save_audio_caption_video_path)\n    concatenate_videos(args.save_audio_caption_video_path)\n    print('final video save path {}'.format(args.save_audio_caption_video_path))\n\n\nif __name__ == \"__main__\":\n    parser = 
argparse.ArgumentParser()\n    parser.add_argument(\"--config\", type=str, default=\"configs/vlog_read_script_sample.yaml\")\n    args = parser.parse_args()\n    omega_conf = OmegaConf.load(args.config)\n    save_path = omega_conf.save_path\n    save_origin_video_path = os.path.join(save_path, \"origin_video\")\n    save_caption_video_path = os.path.join(save_path.rsplit('/', 1)[0], \"caption_video\")\n    save_audio_path = os.path.join(save_path.rsplit('/', 1)[0], \"audio\")\n    save_audio_caption_video_path = os.path.join(save_path.rsplit('/', 1)[0], \"audio_caption_video\")\n    if omega_conf.sample_num is not None:\n        for i in range(omega_conf.sample_num):\n            omega_conf.save_origin_video_path = save_origin_video_path + f'-{i}'\n            omega_conf.save_caption_video_path = save_caption_video_path + f'-{i}'\n            omega_conf.save_audio_path = save_audio_path + f'-{i}'\n            omega_conf.save_audio_caption_video_path = save_audio_caption_video_path + f'-{i}'\n            omega_conf.seed += i\n            main(omega_conf)\n    else:\n        omega_conf.save_origin_video_path = save_origin_video_path\n        omega_conf.save_caption_video_path = save_caption_video_path\n        omega_conf.save_audio_path = save_audio_path\n        omega_conf.save_audio_caption_video_path = save_audio_caption_video_path\n        main(omega_conf)\n"
  },
  {
    "path": "sample_scripts/vlog_write_script.py",
    "content": "import torch\nimport os\nos.environ['CURL_CA_BUNDLE'] = ''\nimport argparse\nfrom omegaconf import OmegaConf\nfrom diffusers import DiffusionPipeline\nfrom vlogger.planning_utils.gpt4_utils import (ExtractProtagonist,\n                                               ExtractAProtagonist,\n                                               split_story,\n                                               patch_story_scripts,\n                                               refine_story_scripts,\n                                               protagonist_place_reference1,\n                                               translate_video_script,\n                                               time_scripts,\n                                               )\n\n\ndef main(args):\n    story_path = args.story_path\n    save_script_path = os.path.join(story_path.rsplit('/', 1)[0], \"script\")\n    if not os.path.exists(save_script_path):\n            os.makedirs(save_script_path)\n    with open(story_path, \"r\") as story_file:\n        story = story_file.read()\n        \n    # summerize protagonists and places\n    protagonists_places_file_path = os.path.join(save_script_path, \"protagonists_places.txt\")\n    if args.only_one_protagonist:\n        character_places = ExtractAProtagonist(story, protagonists_places_file_path)\n    else:\n        character_places = ExtractProtagonist(story, protagonists_places_file_path)\n    print(\"Protagonists and places OK\", flush=True)\n    \n    # make script\n    script_file_path = os.path.join(save_script_path, \"video_prompts.txt\")\n    video_list = split_story(story, script_file_path)\n    video_list = patch_story_scripts(story, video_list, script_file_path)\n    video_list = refine_story_scripts(video_list, script_file_path)\n    print(\"Scripts OK\", flush=True)\n    \n    # think about the protagonist in each scene\n    reference_file_path = os.path.join(save_script_path, \"protagonist_place_reference.txt\")\n    
reference_lists = protagonist_place_reference1(video_list, character_places, reference_file_path)\n    print(\"Reference protagonist OK\", flush=True)\n    \n    # translate the English script to Chinese\n    zh_file_path = os.path.join(save_script_path, \"zh_video_prompts.txt\")\n    zh_video_list = translate_video_script(video_list, zh_file_path)\n    print(\"Translation OK\", flush=True)\n    \n    # schedule the time of script\n    time_file_path = os.path.join(save_script_path, \"time_scripts.txt\")\n    time_list = time_scripts(video_list, time_file_path)\n    print(\"Time script OK\", flush=True)\n    \n    # make reference image\n    base = DiffusionPipeline.from_pretrained(\"stabilityai/stable-diffusion-xl-base-1.0\", \n                                             torch_dtype=torch.float16, \n                                             variant=\"fp16\", \n                                             use_safetensors=True,\n                                             ).to(\"cuda\")\n    refiner = DiffusionPipeline.from_pretrained(\"stabilityai/stable-diffusion-xl-refiner-1.0\",\n                                                text_encoder_2=base.text_encoder_2,\n                                                vae=base.vae,\n                                                torch_dtype=torch.float16,\n                                                use_safetensors=True,\n                                                variant=\"fp16\",\n                                                ).to(\"cuda\")\n    ref_dir_path = os.path.join(story_path.rsplit('/', 1)[0], \"ref_img\")\n    if not os.path.exists(ref_dir_path):\n            os.makedirs(ref_dir_path)\n    for key, value in character_places.items():\n        prompt = key + \", \" + value\n        img_path = os.path.join(ref_dir_path, key + \".jpg\")\n        image = base(prompt=prompt, \n                     output_type=\"latent\", \n                     height=1024, \n                     width=1024, \n     
                guidance_scale=7\n                     ).images[0]\n        image = refiner(prompt=prompt, image=image[None, :]).images[0]\n        image.save(img_path)\n    print(\"Reference image OK\", flush=True)\n    \n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--config\", type=str, default=\"configs/vlog_write_script.yaml\")\n    args = parser.parse_args()\n    omega_conf = OmegaConf.load(args.config)\n    main(omega_conf)\n"
  },
  {
    "path": "sample_scripts/with_mask_ref_sample.py",
    "content": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n# All rights reserved.\n\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n\n\"\"\"\nSample new images from a pre-trained DiT.\n\"\"\"\nimport os\nimport sys\nimport math\ntry:\n    import utils\n    from diffusion import create_diffusion\nexcept:\n    # sys.path.append(os.getcwd())\n    sys.path.append(os.path.split(sys.path[0])[0])\n    # sys.path[0]                 \n    # os.path.split(sys.path[0])    \n    import utils\n\n    from diffusion import create_diffusion\n\nimport torch\ntorch.backends.cuda.matmul.allow_tf32 = True\ntorch.backends.cudnn.allow_tf32 = True\nimport argparse\nimport torchvision\n\nfrom einops import rearrange\nfrom models import get_models\nfrom torchvision.utils import save_image\nfrom diffusers.models import AutoencoderKL\nfrom models.clip import TextEmbedder\nfrom omegaconf import OmegaConf\nfrom PIL import Image\nimport numpy as np\nfrom torchvision import transforms\nsys.path.append(\"..\")\nfrom datasets import video_transforms\nfrom utils import mask_generation_before\nfrom natsort import natsorted\nfrom diffusers.utils.import_utils import is_xformers_available\nfrom vlogger.STEB.model_transform import ip_scale_set, ip_transform_model, tca_transform_model\nfrom transformers import CLIPVisionModelWithProjection, CLIPImageProcessor\n\ndef get_input(args):\n    input_path = args.input_path\n    transform_video = transforms.Compose([\n            video_transforms.ToTensorVideo(), # TCHW\n            video_transforms.ResizeVideo((args.image_h, args.image_w)),\n            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)\n        ])\n    if input_path is not None:\n        print(f'loading video from {input_path}')\n        if os.path.isdir(input_path):\n            file_list = os.listdir(input_path)\n            video_frames = []\n            if 
args.mask_type.startswith('onelast'):\n                num = int(args.mask_type.split('onelast')[-1])\n                # get first and last frame\n                first_frame_path = os.path.join(input_path, natsorted(file_list)[0])\n                last_frame_path = os.path.join(input_path, natsorted(file_list)[-1])\n                first_frame = torch.as_tensor(np.array(Image.open(first_frame_path), dtype=np.uint8, copy=True)).unsqueeze(0)\n                last_frame = torch.as_tensor(np.array(Image.open(last_frame_path), dtype=np.uint8, copy=True)).unsqueeze(0)\n                for i in range(num):\n                    video_frames.append(first_frame)\n                # add zeros to frames\n                num_zeros = args.num_frames-2*num\n                for i in range(num_zeros):\n                    zeros = torch.zeros_like(first_frame)\n                    video_frames.append(zeros)\n                for i in range(num):\n                    video_frames.append(last_frame)\n                n = 0\n                video_frames = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2) # f,c,h,w\n                video_frames = transform_video(video_frames)\n            else:\n                for file in file_list:\n                    if file.endswith('jpg') or file.endswith('png'):\n                        image = torch.as_tensor(np.array(Image.open(file), dtype=np.uint8, copy=True)).unsqueeze(0)\n                        video_frames.append(image)\n                    else:\n                        continue\n                n = 0\n                video_frames = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2) # f,c,h,w\n                video_frames = transform_video(video_frames)\n            return video_frames, n\n        elif os.path.isfile(input_path):\n            _, full_file_name = os.path.split(input_path)\n            file_name, extention = os.path.splitext(full_file_name)\n            if extention == '.jpg' or extention == '.png':\n                
print(\"loading the input image\")\n                video_frames = []\n                num = int(args.mask_type.split('first')[-1])\n                first_frame = torch.as_tensor(np.array(Image.open(input_path), dtype=np.uint8, copy=True)).unsqueeze(0)\n                for i in range(num):\n                    video_frames.append(first_frame)\n                num_zeros = args.num_frames-num\n                for i in range(num_zeros):\n                    zeros = torch.zeros_like(first_frame)\n                    video_frames.append(zeros)\n                n = 0\n                video_frames = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2) # f,c,h,w\n                video_frames = transform_video(video_frames)\n                return video_frames, n\n            else:\n                raise TypeError(f'{extention} is not supported !!')\n        else:\n            raise ValueError('Please check your path input!!')\n    else:\n        print('given video is None, using text to video')\n        video_frames = torch.zeros(16,3,args.latent_h,args.latent_w,dtype=torch.uint8)\n        args.mask_type = 'all'\n        video_frames = transform_video(video_frames)\n        n = 0\n        return video_frames, n\n\ndef auto_inpainting(args, \n                    video_input, \n                    masked_video, \n                    mask, \n                    prompt, \n                    image, \n                    vae, \n                    text_encoder, \n                    image_encoder, \n                    diffusion, \n                    model, \n                    device,\n                    ):\n    image_prompt_embeds = None\n    if prompt is None:\n        prompt = \"\"\n    if image is not None:\n        clip_image = CLIPImageProcessor()(images=image, return_tensors=\"pt\").pixel_values\n        clip_image_embeds = image_encoder(clip_image.to(device)).image_embeds\n        uncond_clip_image_embeds = torch.zeros_like(clip_image_embeds).to(device)\n        
image_prompt_embeds = torch.cat([clip_image_embeds, uncond_clip_image_embeds], dim=0)\n        image_prompt_embeds = rearrange(image_prompt_embeds, '(b n) c -> b n c', b=2).contiguous()\n        model = ip_scale_set(model, args.ref_cfg_scale)\n        if args.use_fp16:\n            image_prompt_embeds = image_prompt_embeds.to(dtype=torch.float16)\n    b, f, c, h, w = video_input.shape\n    latent_h = video_input.shape[-2] // 8\n    latent_w = video_input.shape[-1] // 8\n\n    if args.use_fp16:\n        z = torch.randn(1, 4, 16, latent_h, latent_w, dtype=torch.float16, device=device) # b,c,f,h,w\n        masked_video = masked_video.to(dtype=torch.float16)\n        mask = mask.to(dtype=torch.float16)\n    else:\n        z = torch.randn(1, 4, 16, latent_h, latent_w, device=device) # b,c,f,h,w\n\n    masked_video = rearrange(masked_video, 'b f c h w -> (b f) c h w').contiguous()\n    masked_video = vae.encode(masked_video).latent_dist.sample().mul_(0.18215)\n    masked_video = rearrange(masked_video, '(b f) c h w -> b c f h w', b=b).contiguous()\n    mask = torch.nn.functional.interpolate(mask[:,:,0,:], size=(latent_h, latent_w)).unsqueeze(1)\n    masked_video = torch.cat([masked_video] * 2)\n    mask = torch.cat([mask] * 2)\n    z = torch.cat([z] * 2)\n    prompt_all = [prompt] + [args.negative_prompt]\n\n    text_prompt = text_encoder(text_prompts=prompt_all, train=False)\n    model_kwargs = dict(encoder_hidden_states=text_prompt, \n                        class_labels=None, \n                        cfg_scale=args.cfg_scale,\n                        use_fp16=args.use_fp16,\n                        ip_hidden_states=image_prompt_embeds)\n    \n    # Sample images:\n    samples = diffusion.ddim_sample_loop(\n        model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device, \\\n        mask=mask, x_start=masked_video, use_concat=True\n    )\n    samples, _ = samples.chunk(2, dim=0) # [1, 4, 16, 32, 32]\n    if 
args.use_fp16:\n        samples = samples.to(dtype=torch.float16)\n\n    video_clip = samples[0].permute(1, 0, 2, 3).contiguous() # [16, 4, 32, 32]\n    video_clip = vae.decode(video_clip / 0.18215).sample # [16, 3, 256, 256]\n    return video_clip\n\ndef main(args):\n    # Setup PyTorch:\n    if args.seed:\n        torch.manual_seed(args.seed)\n    torch.set_grad_enabled(False)\n    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n    # device = \"cpu\"\n\n    if args.ckpt is None:\n        raise ValueError(\"Please specify a checkpoint path using --ckpt <path>\")\n\n    # Load model:\n    latent_h = args.image_size[0] // 8\n    latent_w = args.image_size[1] // 8\n    args.image_h = args.image_size[0]\n    args.image_w = args.image_size[1]\n    args.latent_h = latent_h\n    args.latent_w = latent_w\n    print('loading model')\n    model = get_models(args).to(device)\n    model = tca_transform_model(model).to(device)\n    model = ip_transform_model(model).to(device)\n\n    if args.enable_xformers_memory_efficient_attention:\n        if is_xformers_available():\n            model.enable_xformers_memory_efficient_attention()\n        else:\n            raise ValueError(\"xformers is not available. 
Make sure it is installed correctly\")\n\n    # load model \n    ckpt_path = args.ckpt \n    state_dict = torch.load(ckpt_path, map_location=lambda storage, loc: storage)['ema']\n    model_dict = model.state_dict()\n    pretrained_dict = {}\n    for k, v in state_dict.items():\n        if k in model_dict:\n            pretrained_dict[k] = v\n    model_dict.update(pretrained_dict)\n    model.load_state_dict(model_dict)\n    \n    model.eval()\n    pretrained_model_path = args.pretrained_model_path\n    diffusion = create_diffusion(str(args.num_sampling_steps))\n    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder=\"vae\").to(device)\n    text_encoder = TextEmbedder(pretrained_model_path).to(device)\n    image_encoder = CLIPVisionModelWithProjection.from_pretrained(args.image_encoder_path).to(device)\n    if args.use_fp16:\n        print('Warnning: using half percision for inferencing!')\n        vae.to(dtype=torch.float16)\n        model.to(dtype=torch.float16)\n        text_encoder.to(dtype=torch.float16)\n\n    # prompt:\n    prompt = args.text_prompt\n    if prompt ==[]:\n        prompt = args.input_path.split('/')[-1].split('.')[0].replace('_', ' ')\n    else:\n        prompt = prompt[0]\n    prompt_base = prompt.replace(' ','_')\n    prompt = prompt + args.additional_prompt\n\n    if not os.path.exists(os.path.join(args.save_path)):\n        os.makedirs(os.path.join(args.save_path))\n    video_input, researve_frames = get_input(args) # f,c,h,w\n    video_input = video_input.to(device).unsqueeze(0) # b,f,c,h,w\n    mask = mask_generation_before(args.mask_type, video_input.shape, video_input.dtype, device) # b,f,c,h,w\n    masked_video = video_input * (mask == 0)\n    \n    pil_image = Image.open(args.ref_path)\n    pil_image.resize((256, 256))\n\n    video_clip = auto_inpainting(args, \n                                 video_input, \n                                 masked_video, \n                                 mask, \n                     
            prompt, \n                                 pil_image, \n                                 vae, \n                                 text_encoder, \n                                 image_encoder, \n                                 diffusion, \n                                 model, \n                                 device,\n                                 )\n    video_ = ((video_clip * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1)\n    save_video_path = os.path.join(args.save_path,  prompt_base+ '.mp4')\n    torchvision.io.write_video(save_video_path, video_, fps=8)\n    print(f'save in {save_video_path}')\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--config\", type=str, default=\"configs/with_mask_ref_sample.yaml\")\n    args = parser.parse_args()\n    omega_conf = OmegaConf.load(args.config)\n    main(omega_conf)\n"
  },
  {
    "path": "sample_scripts/with_mask_sample.py",
    "content": "# Copyright (c) Meta Platforms, Inc. and affiliates.\r\n# All rights reserved.\r\n\r\n# This source code is licensed under the license found in the\r\n# LICENSE file in the root directory of this source tree.\r\n\r\n\"\"\"\r\nSample new images from a pre-trained DiT.\r\n\"\"\"\r\nimport os\r\nimport sys\r\nimport math\r\ntry:\r\n    import utils\r\n    from diffusion import create_diffusion\r\nexcept:\r\n    # sys.path.append(os.getcwd())\r\n    sys.path.append(os.path.split(sys.path[0])[0])\r\n    # sys.path[0]                 \r\n    # os.path.split(sys.path[0])    \r\n    import utils\r\n\r\n    from diffusion import create_diffusion\r\n\r\nimport torch\r\ntorch.backends.cuda.matmul.allow_tf32 = True\r\ntorch.backends.cudnn.allow_tf32 = True\r\nimport argparse\r\nimport torchvision\r\n\r\nfrom einops import rearrange\r\nfrom models import get_models\r\nfrom torchvision.utils import save_image\r\nfrom diffusers.models import AutoencoderKL\r\nfrom models.clip import TextEmbedder\r\nfrom omegaconf import OmegaConf\r\nfrom PIL import Image\r\nimport numpy as np\r\nfrom torchvision import transforms\r\nsys.path.append(\"..\")\r\nfrom datasets import video_transforms\r\nfrom utils import mask_generation_before\r\nfrom natsort import natsorted\r\nfrom diffusers.utils.import_utils import is_xformers_available\r\nfrom vlogger.STEB.model_transform import tca_transform_model\r\n\r\ndef get_input(args):\r\n    input_path = args.input_path\r\n    transform_video = transforms.Compose([\r\n            video_transforms.ToTensorVideo(), # TCHW\r\n            video_transforms.ResizeVideo((args.image_h, args.image_w)),\r\n            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)\r\n        ])\r\n    if input_path is not None:\r\n        print(f'loading video from {input_path}')\r\n        if os.path.isdir(input_path):\r\n            file_list = os.listdir(input_path)\r\n            video_frames = []\r\n            if 
args.mask_type.startswith('onelast'):\r\n                num = int(args.mask_type.split('onelast')[-1])\r\n                # get first and last frame\r\n                first_frame_path = os.path.join(input_path, natsorted(file_list)[0])\r\n                last_frame_path = os.path.join(input_path, natsorted(file_list)[-1])\r\n                first_frame = torch.as_tensor(np.array(Image.open(first_frame_path), dtype=np.uint8, copy=True)).unsqueeze(0)\r\n                last_frame = torch.as_tensor(np.array(Image.open(last_frame_path), dtype=np.uint8, copy=True)).unsqueeze(0)\r\n                for i in range(num):\r\n                    video_frames.append(first_frame)\r\n                # add zeros to frames\r\n                num_zeros = args.num_frames-2*num\r\n                for i in range(num_zeros):\r\n                    zeros = torch.zeros_like(first_frame)\r\n                    video_frames.append(zeros)\r\n                for i in range(num):\r\n                    video_frames.append(last_frame)\r\n                n = 0\r\n                video_frames = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2) # f,c,h,w\r\n                video_frames = transform_video(video_frames)\r\n            else:\r\n                for file in file_list:\r\n                    if file.endswith('jpg') or file.endswith('png'):\r\n                        image = torch.as_tensor(np.array(Image.open(file), dtype=np.uint8, copy=True)).unsqueeze(0)\r\n                        video_frames.append(image)\r\n                    else:\r\n                        continue\r\n                n = 0\r\n                video_frames = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2) # f,c,h,w\r\n                video_frames = transform_video(video_frames)\r\n            return video_frames, n\r\n        elif os.path.isfile(input_path):\r\n            _, full_file_name = os.path.split(input_path)\r\n            file_name, extention = os.path.splitext(full_file_name)\r\n            if 
extention == '.jpg' or extention == '.png':\r\n                print(\"loading the input image\")\r\n                video_frames = []\r\n                num = int(args.mask_type.split('first')[-1])\r\n                first_frame = torch.as_tensor(np.array(Image.open(input_path), dtype=np.uint8, copy=True)).unsqueeze(0)\r\n                for i in range(num):\r\n                    video_frames.append(first_frame)\r\n                num_zeros = args.num_frames-num\r\n                for i in range(num_zeros):\r\n                    zeros = torch.zeros_like(first_frame)\r\n                    video_frames.append(zeros)\r\n                n = 0\r\n                video_frames = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2) # f,c,h,w\r\n                video_frames = transform_video(video_frames)\r\n                return video_frames, n\r\n            else:\r\n                raise TypeError(f'{extention} is not supported !!')\r\n        else:\r\n            raise ValueError('Please check your path input!!')\r\n    else:\r\n        print('given video is None, using text to video')\r\n        video_frames = torch.zeros(16,3,args.latent_h,args.latent_w,dtype=torch.uint8)\r\n        args.mask_type = 'all'\r\n        video_frames = transform_video(video_frames)\r\n        n = 0\r\n        return video_frames, n\r\n\r\ndef auto_inpainting(args, video_input, masked_video, mask, prompt, vae, text_encoder, diffusion, model, device,):\r\n    b,f,c,h,w=video_input.shape\r\n    latent_h = args.image_size[0] // 8\r\n    latent_w = args.image_size[1] // 8\r\n\r\n    # prepare inputs\r\n    if args.use_fp16:\r\n        z = torch.randn(1, 4, args.num_frames, args.latent_h, args.latent_w, dtype=torch.float16, device=device) # b,c,f,h,w\r\n        masked_video = masked_video.to(dtype=torch.float16)\r\n        mask = mask.to(dtype=torch.float16)\r\n    else:\r\n        z = torch.randn(1, 4, args.num_frames, args.latent_h, args.latent_w, device=device) # b,c,f,h,w\r\n\r\n\r\n    
masked_video = rearrange(masked_video, 'b f c h w -> (b f) c h w').contiguous()\r\n    masked_video = vae.encode(masked_video).latent_dist.sample().mul_(0.18215)\r\n    masked_video = rearrange(masked_video, '(b f) c h w -> b c f h w', b=b).contiguous()\r\n    mask = torch.nn.functional.interpolate(mask[:,:,0,:], size=(latent_h, latent_w)).unsqueeze(1)\r\n   \r\n    # classifier_free_guidance\r\n    if args.do_classifier_free_guidance:\r\n        masked_video = torch.cat([masked_video] * 2)\r\n        mask = torch.cat([mask] * 2)\r\n        z = torch.cat([z] * 2)\r\n        prompt_all = [prompt] + [args.negative_prompt]\r\n        \r\n    else:\r\n        masked_video = masked_video\r\n        mask = mask\r\n        z = z\r\n        prompt_all = [prompt]\r\n\r\n    text_prompt = text_encoder(text_prompts=prompt_all, train=False)\r\n    model_kwargs = dict(encoder_hidden_states=text_prompt, \r\n                            class_labels=None, \r\n                            cfg_scale=args.cfg_scale,\r\n                            use_fp16=args.use_fp16,) # tav unet\r\n\r\n    # Sample video:\r\n    if args.sample_method == 'ddim':\r\n        samples = diffusion.ddim_sample_loop(\r\n            model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device, \\\r\n            mask=mask, x_start=masked_video, use_concat=args.use_mask\r\n        )\r\n    elif args.sample_method == 'ddpm':\r\n        samples = diffusion.p_sample_loop(\r\n            model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device, \\\r\n            mask=mask, x_start=masked_video, use_concat=args.use_mask\r\n        )\r\n    samples, _ = samples.chunk(2, dim=0) # [1, 4, 16, 32, 32]\r\n    if args.use_fp16:\r\n        samples = samples.to(dtype=torch.float16)\r\n\r\n    video_clip = samples[0].permute(1, 0, 2, 3).contiguous() # [16, 4, 32, 32]\r\n    video_clip = vae.decode(video_clip / 
0.18215).sample # [16, 3, 256, 256]\r\n    return video_clip\r\n\r\ndef main(args):\r\n    # Setup PyTorch:\r\n    if args.seed:\r\n        torch.manual_seed(args.seed)\r\n    torch.set_grad_enabled(False)\r\n    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\r\n    # device = \"cpu\"\r\n\r\n    if args.ckpt is None:\r\n        raise ValueError(\"Please specify a checkpoint path using --ckpt <path>\")\r\n\r\n    # Load model:\r\n    latent_h = args.image_size[0] // 8\r\n    latent_w = args.image_size[1] // 8\r\n    args.image_h = args.image_size[0]\r\n    args.image_w = args.image_size[1]\r\n    args.latent_h = latent_h\r\n    args.latent_w = latent_w\r\n    print('loading model')\r\n    model = get_models(args).to(device)\r\n    model = tca_transform_model(model).to(device)\r\n\r\n    if args.enable_xformers_memory_efficient_attention:\r\n        if is_xformers_available():\r\n            model.enable_xformers_memory_efficient_attention()\r\n        else:\r\n            raise ValueError(\"xformers is not available. 
Make sure it is installed correctly\")\r\n\r\n    # load model \r\n    ckpt_path = args.ckpt \r\n    state_dict = torch.load(ckpt_path, map_location=lambda storage, loc: storage)['ema']\r\n    model_dict = model.state_dict()\r\n    pretrained_dict = {}\r\n    for k, v in state_dict.items():\r\n        if k in model_dict:\r\n            pretrained_dict[k] = v\r\n    model_dict.update(pretrained_dict)\r\n    model.load_state_dict(model_dict)\r\n    \r\n    model.eval()\r\n    pretrained_model_path = args.pretrained_model_path\r\n    diffusion = create_diffusion(str(args.num_sampling_steps))\r\n    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder=\"vae\").to(device)\r\n    text_encoder = TextEmbedder(pretrained_model_path).to(device)\r\n    if args.use_fp16:\r\n        print('Warnning: using half percision for inferencing!')\r\n        vae.to(dtype=torch.float16)\r\n        model.to(dtype=torch.float16)\r\n        text_encoder.to(dtype=torch.float16)\r\n\r\n    # prompt:\r\n    prompt = args.text_prompt\r\n    if prompt ==[]:\r\n        prompt = args.input_path.split('/')[-1].split('.')[0].replace('_', ' ')\r\n    else:\r\n        prompt = prompt[0]\r\n    prompt_base = prompt.replace(' ','_')\r\n    prompt = prompt + args.additional_prompt\r\n\r\n    if not os.path.exists(os.path.join(args.save_path)):\r\n        os.makedirs(os.path.join(args.save_path))\r\n    video_input, researve_frames = get_input(args) # f,c,h,w\r\n    video_input = video_input.to(device).unsqueeze(0) # b,f,c,h,w\r\n    mask = mask_generation_before(args.mask_type, video_input.shape, video_input.dtype, device) # b,f,c,h,w\r\n    masked_video = video_input * (mask == 0)\r\n\r\n    video_clip = auto_inpainting(args, video_input, masked_video, mask, prompt, vae, text_encoder, diffusion, model, device,)\r\n    video_ = ((video_clip * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1)\r\n    save_video_path = os.path.join(args.save_path,  
prompt_base+ '.mp4')\r\n    torchvision.io.write_video(save_video_path, video_, fps=8)\r\n    print(f'save in {save_video_path}')\r\n\r\n\r\nif __name__ == \"__main__\":\r\n    parser = argparse.ArgumentParser()\r\n    parser.add_argument(\"--config\", type=str, default=\"configs/with_mask_sample.yaml\")\r\n    args = parser.parse_args()\r\n    omega_conf = OmegaConf.load(args.config)\r\n    main(omega_conf)\r\n"
  },
  {
    "path": "utils.py",
    "content": "import os\nimport math\nimport torch\nimport logging\nimport subprocess\nimport numpy as np\nimport torch.distributed as dist\n\n# from torch._six import inf\nfrom torch import inf\nfrom PIL import Image\nfrom typing import Union, Iterable\nfrom collections import OrderedDict\nfrom torch.utils.tensorboard import SummaryWriter   \n_tensor_or_tensors = Union[torch.Tensor, Iterable[torch.Tensor]]\n\n#################################################################################\n#                             Training Helper Functions                         #\n#################################################################################\ndef fetch_files_by_numbers(start_number, count, file_list):\n    file_numbers = range(start_number, start_number + count)\n    found_files = []\n    for file_number in file_numbers:\n        file_number_padded = str(file_number).zfill(2)\n        for file_name in file_list:\n            if file_name.endswith(file_number_padded + '.csv'):\n                found_files.append(file_name)\n                break  # Stop searching once a file is found for the current number\n    return found_files\n\n#################################################################################\n#                             Training Clip Gradients                           #\n#################################################################################\n\ndef get_grad_norm(\n        parameters: _tensor_or_tensors, norm_type: float = 2.0) -> torch.Tensor:\n    r\"\"\"\n    Copy from torch.nn.utils.clip_grad_norm_\n\n    Clips gradient norm of an iterable of parameters.\n\n    The norm is computed over all gradients together, as if they were\n    concatenated into a single vector. 
Gradients are modified in-place.\n\n    Args:\n        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a\n            single Tensor that will have gradients normalized\n        max_norm (float or int): max norm of the gradients\n        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for\n            infinity norm.\n        error_if_nonfinite (bool): if True, an error is thrown if the total\n            norm of the gradients from :attr:`parameters` is ``nan``,\n            ``inf``, or ``-inf``. Default: False (will switch to True in the future)\n\n    Returns:\n        Total norm of the parameter gradients (viewed as a single vector).\n    \"\"\"\n    if isinstance(parameters, torch.Tensor):\n        parameters = [parameters]\n    grads = [p.grad for p in parameters if p.grad is not None]\n    norm_type = float(norm_type)\n    if len(grads) == 0:\n        return torch.tensor(0.)\n    device = grads[0].device\n    if norm_type == inf:\n        norms = [g.detach().abs().max().to(device) for g in grads]\n        total_norm = norms[0] if len(norms) == 1 else torch.max(torch.stack(norms))\n    else:\n        total_norm = torch.norm(torch.stack([torch.norm(g.detach(), norm_type).to(device) for g in grads]), norm_type)\n    return total_norm\n\ndef clip_grad_norm_(\n        parameters: _tensor_or_tensors, max_norm: float, norm_type: float = 2.0,\n        error_if_nonfinite: bool = False, clip_grad = True) -> torch.Tensor:\n    r\"\"\"\n    Copy from torch.nn.utils.clip_grad_norm_\n\n    Clips gradient norm of an iterable of parameters.\n\n    The norm is computed over all gradients together, as if they were\n    concatenated into a single vector. 
Gradients are modified in-place.\n\n    Args:\n        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a\n            single Tensor that will have gradients normalized\n        max_norm (float or int): max norm of the gradients\n        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for\n            infinity norm.\n        error_if_nonfinite (bool): if True, an error is thrown if the total\n            norm of the gradients from :attr:`parameters` is ``nan``,\n            ``inf``, or ``-inf``. Default: False (will switch to True in the future)\n\n    Returns:\n        Total norm of the parameter gradients (viewed as a single vector).\n    \"\"\"\n    if isinstance(parameters, torch.Tensor):\n        parameters = [parameters]\n    grads = [p.grad for p in parameters if p.grad is not None]\n    max_norm = float(max_norm)\n    norm_type = float(norm_type)\n    if len(grads) == 0:\n        return torch.tensor(0.)\n    device = grads[0].device\n    if norm_type == inf:\n        norms = [g.detach().abs().max().to(device) for g in grads]\n        total_norm = norms[0] if len(norms) == 1 else torch.max(torch.stack(norms))\n    else:\n        total_norm = torch.norm(torch.stack([torch.norm(g.detach(), norm_type).to(device) for g in grads]), norm_type)\n    # print(total_norm)\n\n    if clip_grad:\n        if error_if_nonfinite and torch.logical_or(total_norm.isnan(), total_norm.isinf()):\n            raise RuntimeError(\n                f'The total norm of order {norm_type} for gradients from '\n                '`parameters` is non-finite, so it cannot be clipped. 
To disable '\n                'this error and scale the gradients by the non-finite norm anyway, '\n                'set `error_if_nonfinite=False`')\n        clip_coef = max_norm / (total_norm + 1e-6)\n        # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so\n        # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization\n        # when the gradients do not reside in CPU memory.\n        clip_coef_clamped = torch.clamp(clip_coef, max=1.0)\n        for g in grads:\n            g.detach().mul_(clip_coef_clamped.to(g.device))\n        # gradient_cliped = torch.norm(torch.stack([torch.norm(g.detach(), norm_type).to(device) for g in grads]), norm_type)\n        # print(gradient_cliped)\n    return total_norm\n\ndef separation_content_motion(video_clip):\n    \"\"\"\n    separate coontent and motion in a given video\n    Args:\n        video_clip, a give video clip, [B F C H W]\n\n    Return:\n        base frame, [B, 1, C, H, W]\n        motions, [B, F-1, C, H, W], \n        the first is base frame, \n        the second is motions based on base frame\n    \"\"\"\n    total_frames = video_clip.shape[1]\n    base_frame = video_clip[0]\n    motions = [video_clip[i] - base_frame for i in range(1, total_frames)]\n    motions = torch.cat(motions, dim=1)\n    return base_frame, motions\n\ndef get_experiment_dir(root_dir, args):\n    if args.use_compile:\n        root_dir += '-Compile' # speedup by torch compile\n    if args.fixed_spatial:\n        root_dir += '-FixedSpa'\n    if args.enable_xformers_memory_efficient_attention:\n        root_dir += '-Xfor'\n    if args.gradient_checkpointing:\n        root_dir += '-Gc'\n    if args.mixed_precision:\n        root_dir += '-Amp'\n    if args.image_size == 512:\n        root_dir += '-512'\n    return root_dir\n\n#################################################################################\n#                             Training Logger         
                          #\n#################################################################################\n\ndef create_logger(logging_dir):\n    \"\"\"\n    Create a logger that writes to a log file and stdout.\n    \"\"\"\n    if dist.get_rank() == 0:  # real logger\n        logging.basicConfig(\n            level=logging.INFO,\n            # format='[\\033[34m%(asctime)s\\033[0m] %(message)s',\n            format='[%(asctime)s] %(message)s',\n            datefmt='%Y-%m-%d %H:%M:%S',\n            handlers=[logging.StreamHandler(), logging.FileHandler(f\"{logging_dir}/log.txt\")]\n        )\n        logger = logging.getLogger(__name__)\n        \n    else:  # dummy logger (does nothing)\n        logger = logging.getLogger(__name__)\n        logger.addHandler(logging.NullHandler())\n    return logger\n\ndef create_accelerate_logger(logging_dir, is_main_process=False):\n    \"\"\"\n    Create a logger that writes to a log file and stdout.\n    \"\"\"\n    if is_main_process:  # real logger\n        logging.basicConfig(\n            level=logging.INFO,\n            # format='[\\033[34m%(asctime)s\\033[0m] %(message)s',\n            format='[%(asctime)s] %(message)s',\n            datefmt='%Y-%m-%d %H:%M:%S',\n            handlers=[logging.StreamHandler(), logging.FileHandler(f\"{logging_dir}/log.txt\")]\n        )\n        logger = logging.getLogger(__name__)\n    else:  # dummy logger (does nothing)\n        logger = logging.getLogger(__name__)\n        logger.addHandler(logging.NullHandler())\n    return logger\n\n\ndef create_tensorboard(tensorboard_dir):\n    \"\"\"\n    Create a tensorboard that saves losses.\n    \"\"\"\n    if dist.get_rank() == 0:  # real tensorboard \n        # tensorboard \n        writer = SummaryWriter(tensorboard_dir)\n\n    return writer\n\ndef write_tensorboard(writer, *args):\n    '''\n    write the loss information to a tensorboard file.\n    Only for pytorch DDP mode.\n    '''\n    if dist.get_rank() == 0:  # real tensorboard\n 
       writer.add_scalar(args[0], args[1], args[2])\n\n#################################################################################\n#                      EMA Update/ DDP Training Utils                           #\n#################################################################################\n\n@torch.no_grad()\ndef update_ema(ema_model, model, decay=0.9999):\n    \"\"\"\n    Step the EMA model towards the current model.\n    \"\"\"\n    ema_params = OrderedDict(ema_model.named_parameters())\n    model_params = OrderedDict(model.named_parameters())\n\n    for name, param in model_params.items():\n        # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed\n        if param.requires_grad:\n            ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay)\n\ndef requires_grad(model, flag=True):\n    \"\"\"\n    Set requires_grad flag for all parameters in a model.\n    \"\"\"\n    for p in model.parameters():\n        p.requires_grad = flag\n\ndef cleanup():\n    \"\"\"\n    End DDP training.\n    \"\"\"\n    dist.destroy_process_group()\n    \n\ndef setup_distributed(backend=\"nccl\", port=None):\n    \"\"\"Initialize distributed training environment.\n    support both slurm and torch.distributed.launch\n    see torch.distributed.init_process_group() for more details\n    \"\"\"\n    num_gpus = torch.cuda.device_count()\n\n    if \"SLURM_JOB_ID\" in os.environ:\n        rank = int(os.environ[\"SLURM_PROCID\"])\n        world_size = int(os.environ[\"SLURM_NTASKS\"])\n        node_list = os.environ[\"SLURM_NODELIST\"]\n        addr = subprocess.getoutput(f\"scontrol show hostname {node_list} | head -n1\")\n        # specify master port\n        if port is not None:\n            os.environ[\"MASTER_PORT\"] = str(port)\n        elif \"MASTER_PORT\" not in os.environ:\n            # os.environ[\"MASTER_PORT\"] = \"29566\"\n            os.environ[\"MASTER_PORT\"] = str(29566 + num_gpus)\n        if 
\"MASTER_ADDR\" not in os.environ:\n            os.environ[\"MASTER_ADDR\"] = addr\n        os.environ[\"WORLD_SIZE\"] = str(world_size)\n        os.environ[\"LOCAL_RANK\"] = str(rank % num_gpus)\n        os.environ[\"RANK\"] = str(rank)\n    else:\n        rank = int(os.environ[\"RANK\"])\n        world_size = int(os.environ[\"WORLD_SIZE\"])\n\n    # torch.cuda.set_device(rank % num_gpus)\n\n    dist.init_process_group(\n        backend=backend,\n        world_size=world_size,\n        rank=rank,\n    )\n\n#################################################################################\n#                             Testing  Utils                                    #\n#################################################################################\n\ndef save_video_grid(video, nrow=None):\n    b, t, h, w, c = video.shape\n    \n    if nrow is None:\n        nrow = math.ceil(math.sqrt(b))\n    ncol = math.ceil(b / nrow)\n    padding = 1\n    video_grid = torch.zeros((t, (padding + h) * nrow + padding,\n                           (padding + w) * ncol + padding, c), dtype=torch.uint8)\n    \n    print(video_grid.shape)\n    for i in range(b):\n        r = i // ncol\n        c = i % ncol\n        start_r = (padding + h) * r\n        start_c = (padding + w) * c\n        video_grid[:, start_r:start_r + h, start_c:start_c + w] = video[i]\n    \n    return video_grid\n\ndef save_videos_grid_tav(videos: torch.Tensor, path: str, rescale=False, n_rows=4, fps=8):\n    from einops import rearrange\n    import imageio\n    import torchvision\n\n    videos = rearrange(videos, \"b c t h w -> t b c h w\")\n    outputs = []\n    for x in videos:\n        x = torchvision.utils.make_grid(x, nrow=n_rows)\n        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)\n        if rescale:\n            x = (x + 1.0) / 2.0  # -1,1 -> 0,1\n        x = (x * 255).numpy().astype(np.uint8)\n        outputs.append(x)\n\n    # os.makedirs(os.path.dirname(path), exist_ok=True)\n    
imageio.mimsave(path, outputs, fps=fps)\n\n\n#################################################################################\n#                             MMCV  Utils                                    #\n#################################################################################\n\n\ndef collect_env():\n    # Copyright (c) OpenMMLab. All rights reserved.\n    from mmcv.utils import collect_env as collect_base_env\n    from mmcv.utils import get_git_hash\n    \"\"\"Collect the information of the running environments.\"\"\"\n    \n    env_info = collect_base_env()\n    env_info['MMClassification'] = get_git_hash()[:7]\n\n    for name, val in env_info.items():\n        print(f'{name}: {val}')\n    \n    print(torch.cuda.get_arch_list())\n    print(torch.version.cuda)\n\n\n#################################################################################\n#                      Long video generation  Utils                             #\n#################################################################################\n    \ndef mask_generation_before(mask_type, shape, dtype, device, dropout_prob=0.0, use_image_num=0):\n    b, f, c, h, w = shape\n    if mask_type.startswith('first'):\n        num = int(mask_type.split('first')[-1])\n        mask_f = torch.cat([torch.zeros(1, num, 1, 1, 1, dtype=dtype, device=device),\n                           torch.ones(1, f-num, 1, 1, 1, dtype=dtype, device=device)], dim=1)\n        mask = mask_f.expand(b, -1, c, h, w)\n    elif mask_type.startswith('all'):\n        mask = torch.ones(b,f,c,h,w,dtype=dtype,device=device)\n    elif mask_type.startswith('onelast'):\n        num = int(mask_type.split('onelast')[-1])\n        mask_one = torch.zeros(1,1,1,1,1, dtype=dtype, device=device)\n        mask_mid = torch.ones(1,f-2*num,1,1,1,dtype=dtype, device=device)\n        mask_last = torch.zeros_like(mask_one)\n        mask = torch.cat([mask_one]*num + [mask_mid] + [mask_last]*num, dim=1)\n        mask = mask.expand(b, -1, c, h, 
w)\n    else:\n        raise ValueError(f\"Invalid mask type: {mask_type}\")\n    return mask\n"
  },
  {
    "path": "vlogger/STEB/model_transform.py",
    "content": "import torch\n# import argparse\n# from omegaconf import OmegaConf\n# from models import get_models\n# import sys\n# import os\n# from PIL import Image\n# from copy import deepcopy\n\n\ndef tca_transform_model(model):\n    for down_block in model.down_blocks:\n        try:\n            for attention in down_block.attentions:\n                attention.transformer_blocks[0].tca_transform()\n                attention.transformer_blocks[0].tca_transform()\n        except:\n            continue\n    for attention in model.mid_block.attentions:\n        attention.transformer_blocks[0].tca_transform()\n        attention.transformer_blocks[0].tca_transform()\n    for up_block in model.up_blocks:\n        try:\n            for attention in up_block.attentions:\n                attention.transformer_blocks[0].tca_transform()\n                attention.transformer_blocks[0].tca_transform()\n        except:\n            continue\n    return model\n\n\nclass ImageProjModel(torch.nn.Module):\n    \"\"\"Projection Model\"\"\"\n    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):\n        super().__init__()\n        \n        self.cross_attention_dim = cross_attention_dim\n        self.clip_extra_context_tokens = clip_extra_context_tokens\n        self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)\n        self.norm = torch.nn.LayerNorm(cross_attention_dim)\n        \n    def forward(self, image_embeds):\n        embeds = image_embeds\n        clip_extra_context_tokens = self.proj(embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)\n        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)\n        return clip_extra_context_tokens\n    \n\ndef ip_transform_model(model):\n    model.image_proj_model = ImageProjModel(cross_attention_dim=768, clip_embeddings_dim=1024,\n                                            
clip_extra_context_tokens=4).to(model.device)\n    for down_block in model.down_blocks:\n        try:\n            for attention in down_block.attentions:\n                attention.transformer_blocks[0].attn2.ip_transform()\n                attention.transformer_blocks[0].attn2.ip_transform()\n        except:\n            continue\n    for attention in model.mid_block.attentions:\n        attention.transformer_blocks[0].attn2.ip_transform()\n        attention.transformer_blocks[0].attn2.ip_transform()\n    for up_block in model.up_blocks:\n        try:\n            for attention in up_block.attentions:\n                attention.transformer_blocks[0].attn2.ip_transform()\n                attention.transformer_blocks[0].attn2.ip_transform()\n        except:\n            continue\n    return model\n\n\ndef ip_scale_set(model, scale):\n    for down_block in model.down_blocks:\n        try:\n            for attention in down_block.attentions:\n                attention.transformer_blocks[0].attn2.set_scale(scale)\n                attention.transformer_blocks[0].attn2.set_scale(scale)\n        except:\n            continue\n    for attention in model.mid_block.attentions:\n        attention.transformer_blocks[0].attn2.set_scale(scale)\n        attention.transformer_blocks[0].attn2.set_scale(scale)\n    for up_block in model.up_blocks:\n        try:\n            for attention in up_block.attentions:\n                attention.transformer_blocks[0].attn2.set_scale(scale)\n                attention.transformer_blocks[0].attn2.set_scale(scale)\n        except:\n            continue\n    return model\n\n\ndef ip_train_set(model):\n    model.requires_grad_(False)\n    model.image_proj_model.requires_grad_(True)\n    for down_block in model.down_blocks:\n        try:\n            for attention in down_block.attentions:\n                attention.transformer_blocks[0].attn2.ip_train_set()\n                attention.transformer_blocks[0].attn2.ip_train_set()\n        except:\n  
          continue\n    for attention in model.mid_block.attentions:\n        attention.transformer_blocks[0].attn2.ip_train_set()\n        attention.transformer_blocks[0].attn2.ip_train_set()\n    for up_block in model.up_blocks:\n        try:\n            for attention in up_block.attentions:\n                attention.transformer_blocks[0].attn2.ip_train_set()\n                attention.transformer_blocks[0].attn2.ip_train_set()\n        except:\n            continue\n    return model\n"
  },
  {
    "path": "vlogger/planning_utils/gpt4_utils.py",
    "content": "import openai\nimport re\nimport ast\n\n# Enter your openai key here\n# Allow multiple keys to be filled in to prevent the number of visits from being restricted\nopenai_key = [\"\"]\n\nglobal_key_num = 0\n\n\ndef smart_openai_key():\n    global global_key_num\n    global openai_key\n    openai.api_key = openai_key[global_key_num]\n    global_key_num += 1\n    global_key_num %= len(openai_key)\n\n\ndef json_completion(prompt):\n    try_time = 3\n    for i in range(try_time):\n        try:\n            smart_openai_key()\n            completions = openai.ChatCompletion.create(\n                model=\"gpt-4\",  # \"gpt-3.5-turbo\",\n                messages=[\n                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n                    {\"role\": \"user\", \"content\": prompt},\n                    {\"role\": \"assistant\", \"content\": \"[\"}\n                ]\n            )\n            break\n        except:\n            print(\"key error: \", openai.api_key)\n\n    message = completions['choices'][0]['message']['content']\n    return message\n\n\ndef ExtractProtagonist(story, file_path):\n    ask = f\"\"\"The following is a story enclosed in three single quotes '''{story}''', please help me summarize all the main protagonists and places that appear in the story.\"\"\" \\\n          f\"\"\"Provide me the answer in JSON format(do not answer anything else) with the following keys: id, name, description.\"\"\" \\\n          f\"\"\"You must answer like the content in following three single quotes:\"\"\" \\\n          f\"\"\"'''[\n    {{\n        \"id\": 1,\n        \"name\": \"Lincoln\",\n        \"description\": \"(the physical characteristics description of Lincoln)\",\n    }},\n    {{\n        \"id\": 2,\n        \"name\": \"Everest\"\n        \"description\": \"(the physical characteristics description of Everest)\",\n    }}\n]'''\"\"\" \\\n          f\"\"\"The descriptions of the protagonists should adhere to 
the following guidelines:\\n\"\"\" \\\n          f\"\"\"1.The description should be as simple as possible, as long as it doesn't conflict with the story\\n\"\"\" \\\n          f\"\"\"2.Do not include another thing in the description of one thing\\n\"\"\" \\\n          f\"\"\"3.Most important: Only the physical characteristics of the character or place need to be described in detail, such as color and class label, no mood description is required, etc. \"\"\"\n    \n    answer = json_completion(ask)\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = \"[\\n\" + answer + \"\\n]\"\n    f = open(file_path, \"w\")\n    f.write(answer)\n    f.close()\n    protagonists_places_dict = {}\n    protagonists_places = ast.literal_eval(answer)\n    for protagonist_place in protagonists_places:\n        protagonists_places_dict[protagonist_place[\"name\"]] = protagonist_place[\"description\"]\n    return protagonists_places_dict\n\n\ndef ExtractAProtagonist(story, file_path):\n    ask = f\"\"\"The following is a story enclosed in three single quotes '''{story}''', please help me summarize a main protagonist that appear in the story.\"\"\" \\\n          f\"\"\"Provide me the answer in JSON format(do not answer anything else) with the following keys: id, name, description.\"\"\" \\\n          f\"\"\"You must answer like the content in following three single quotes:\"\"\" \\\n          f\"\"\"'''[\n    {{\n        \"id\": 1,\n        \"name\": \"Lincoln\",\n        \"description\": \"(the physical characteristics 
description of Lincoln)\",\n    }}\n]'''\"\"\" \\\n          f\"\"\"The descriptions of the protagonist should adhere to the following guidelines:\\n\"\"\" \\\n          f\"\"\"1.The description should be as simple as possible, as long as it doesn't conflict with the story\\n\"\"\" \\\n          f\"\"\"2.Most important: Only the physical characteristics of the character need to be described in detail, such as color and class label, no mood description is required, etc. \"\"\"\n    \n    answer = json_completion(ask)\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = \"[\\n\" + answer + \"\\n]\"\n    f = open(file_path, \"w\")\n    f.write(answer)\n    f.close()\n    protagonists_places_dict = {}\n    protagonists_places = ast.literal_eval(answer)\n    for protagonist_place in protagonists_places:\n        protagonists_places_dict[protagonist_place[\"name\"]] = protagonist_place[\"description\"]\n    return protagonists_places_dict\n\n\ndef protagonist_place_reference(video_list, character_places):\n    new_video_list = []\n    num = 1\n    for video in video_list:\n        prompt = str(num) + \". \" + video\n        new_video_list.append(prompt)\n        num += 1\n    key_list = []\n    i = 1\n    for key, value in character_places.items():\n        key_list.append(str(i) + \". \" + key)\n        i += 1\n    ask = f\"\"\"I would like to make a video. 
Here are this video script in the following three single quotes '''{new_video_list}''', \"\"\" \\\n          f\"\"\"Here are some characters and places in the following three single quotes '''{key_list}''', \"\"\" \\\n          f\"\"\"Please help me identify the characters or places in the list where each segment of the video script appears\"\"\" \\\n          f\"\"\"You can only choose characters and places that match exactly, and you can't choose even the slightest doubt.\"\"\" \\\n          f\"\"\"Just answer me the serial number(2 selections are possible, but no more, pick out what you think is most likely. If you select less than 2, you can fill it with 0.)\"\"\" \\\n          f\"\"\"Provide me the answer in JSON format(do not answer anything else) with the following keys: video segment id, character/place id.\"\"\" \\\n          f\"\"\"You must answer like the content in following three single quotes:\"\"\" \\\n          f\"\"\"'''[\n    {{\n        \"video segment id\": 1,\n        \"character/place id\": [1, 0],\n    }},\n    {{\n        \"video segment id\": 2,\n        \"character/place id\": [1, 2],\n    }},\n    {{\n        \"video segment id\": 3,\n        \"character/place id\": [0, 0],\n    }}\n]'''\"\"\"\n\n    answer = json_completion(ask)\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = \"[\\n\" + answer + \"\\n]\"\n    f = open(\"MakeVideo/protagonist_place_reference.txt\", \"w\")\n    f.write(answer)\n    f.close()\n    print(answer)\n    reference_list = []\n    
protagonists_places_reference = ast.literal_eval(answer)\n    for protagonist_place_reference in protagonists_places_reference:\n        reference_list.append(protagonist_place_reference[\"character/place id\"])\n    return reference_list\n\n\ndef protagonist_place_reference1(video_list, character_places, file_path):\n    new_video_list = []\n    num = 1\n    for video in video_list:\n        prompt = str(num) + \". \" + video\n        new_video_list.append(prompt)\n        num += 1\n    key_list = []\n    i = 1\n    for key, value in character_places.items():\n        key_list.append(str(i) + \". \" + key)\n        i += 1\n    ask = f\"\"\"I would like to make a video. Here are this video script in the following three single quotes '''{new_video_list}''', \"\"\" \\\n          f\"\"\"Here are some characters and places in the following three single quotes '''{key_list}''', \"\"\" \\\n          f\"\"\"Please help me identify the characters or places in the list where each segment of the video script appears\"\"\" \\\n          f\"\"\"You can only choose characters and places that match exactly, and you can't choose even the slightest doubt.\"\"\" \\\n          f\"\"\"Just answer me the serial number(1 selection is possible, but no more, pick out what you think is most likely. 
If you select less than 1, you can fill it with 0.)\"\"\" \\\n          f\"\"\"Provide me the answer in JSON format(do not answer anything else) with the following keys: video segment id, character/place id.\"\"\" \\\n          f\"\"\"You must answer like the content in following three single quotes:\"\"\" \\\n          f\"\"\"'''[\n        {{\n            \"video segment id\": 1,\n            \"character/place id\": [1],\n        }},\n        {{\n            \"video segment id\": 2,\n            \"character/place id\": [0],\n        }},\n        {{\n            \"video segment id\": 3,\n            \"character/place id\": [2],\n        }}\n        ]'''\"\"\"\n\n    answer = json_completion(ask)\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = \"[\\n\" + answer + \"\\n]\"\n    f = open(file_path, \"w\")\n    f.write(answer)\n    f.close()\n    # print(answer)\n    reference_list = []\n    protagonists_places_reference = ast.literal_eval(answer)\n    for i, prompt in enumerate(video_list):\n        prompt = prompt.lower()\n        for key, value in character_places.items():\n            if key.lower() in prompt:\n                protagonists_places_reference[i][\"character/place id\"] = [1]\n    for protagonist_place_reference in protagonists_places_reference:\n        reference_list.append(protagonist_place_reference[\"character/place id\"])\n    return reference_list\n\n\ndef split_story(story, file_path):\n    ask = f\"\"\"The following is a story enclosed in three single quotes 
'''{story}''' and I would like to request your assistance in \"\"\" \\\n        f\"\"\"writing a script for a video based on this story. Provide the script in JSON format(do not answer anything else) with the following keys: video fragment id, video fragment description.\"\"\" \\\n        f\"\"\"You must answer like the content in following three single quotes:\\n\"\"\" \\\n        f\"\"\"'''[\n{{\n\"video fragment id\": 1,\n\"video fragment description\": \"(the description, describe the characters, actions, and backgrounds in the video fragment)\",\n}},\n{{\n\"video fragment id\": 2,\n\"video fragment description\": \"(the description, describe the characters, actions, and backgrounds in the video fragment)\",\n}}\n]'''\"\"\" \\\n        f\"\"\"The descriptions of the video segments should adhere to the following guidelines:\\n\"\"\" \\\n        f\"\"\"1.Fits the original storyline\\n\"\"\" \\\n        f\"\"\"2.All video fragment descriptions cannot conflict with each other, and the descriptions corresponding to successive fragments in the original story must have a certain continuity\\n\"\"\" \\\n        f\"\"\"3.The description only needs to describe the visual elements presented, such as the subject, action, background, etc., and do not appear useless descriptions, such as mental activities\\n\"\"\" \\\n        f\"\"\"Each description should include the subject, place, and action as much as possible.\"\"\" \\\n        f\"\"\"Read this script carefully and don't pull down any details.\\n\"\"\" \\\n        f\"\"\"As more fragment as possible, as detail as possible!\\n\"\"\" \\\n\n    answer = json_completion(ask)\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    
answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = \"[\\n\" + answer + \"\\n]\"\n    f = open(file_path, \"w\")\n    f.write(answer)\n    f.close()\n    video_fragments = ast.literal_eval(answer)\n    video_list = []\n    for video_fragment in video_fragments:\n        video_list.append(video_fragment[\"video fragment description\"])\n    return video_list\n\n\ndef patch_story_scripts(story, video_list, file_path):\n    ask = f\"\"\"The following is a story enclosed in three single quotes '''{story}'''. I want to make a video according to this story, this is my video production script in the following three single quotes '''{video_list}''', a paragraph in the script corresponds to a clip \"\"\" \\\n          f\"\"\"of the video. However, there may be some plots in the story missing, such as important plot missing, or transitions between pictures, please check and complete it for me. 
\"\"\" \\\n          f\"\"\"Provide me the answer in JSON format(do not answer anything else) with the following keys: video fragment id, video fragment description.\"\"\" \\\n          f\"\"\"You must must must answer like the content in following three single quotes:\\n\"\"\" \\\n          f\"\"\"'''[\n    {{\n        \"video fragment id\": 1,\n        \"video fragment description\": \"(the description)\",\n    }},\n    {{\n        \"video fragment id\": 2,\n        \"video fragment description\": \"(the description)\",\n    }},\n    {{\n        \"video fragment id\": 3,\n        \"video fragment description\": \"(the description)\",\n    }}\n]'''\"\"\" \\\n          f\"\"\"Remember to make sure that the description of each video clip is not long, no more than fifteen words, but there can be so many video clips.\\n\"\"\" \\\n          f\"\"\"Each description should include the subject, place, and action as much as possible.\"\"\" \\\n          f\"\"\"As more fragment as possible, as detail as possible!\\n\"\"\" \\\n          f\"\"\"Read this script carefully and don't pull down any details.\\n\"\"\"\n        #   f\"\"\"Very important!!!: avoid character-to-character interactions and character-to-object interactions in descriptions.\"\"\"\n\n    answer = json_completion(ask)\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = \"[\\n\" + answer + \"\\n]\"\n    f = open(file_path, \"w\")\n    f.write(answer)\n    f.close()\n    video_fragments = ast.literal_eval(answer)\n    video_list = []\n    
for video_fragment in video_fragments:\n        video_list.append(video_fragment[\"video fragment description\"])\n    return video_list\n\n\ndef refine_story_scripts(video_list, file_path):\n    ask = f\"\"\"I want to make a video, this is my video production script in the following three single quotes '''{video_list}''', a paragraph in the script corresponds to a clip \"\"\" \\\n          f\"\"\"of the video, But the description of some video clips is too complicated, please help me analyze and rewrite a video script, split each description into at least three short descriptions and as more as possible. \"\"\" \\\n          f\"\"\"For example, if there are one paragraphs in the script I gave you, then you should split it into fifteen paragraphs.\"\"\" \\\n          f\"\"\"Provide me the answer in JSON format(do not answer anything else) with the following keys: video fragment id, video fragment description.\"\"\" \\\n          f\"\"\"You must must must answer like the content in following three single quotes:\\n\"\"\" \\\n          f\"\"\"'''[\n    {{\n        \"video fragment id\": 1,\n        \"video fragment description\": \"(the description)\",\n    }},\n    {{\n        \"video fragment id\": 2,\n        \"video fragment description\": \"(the description)\",\n    }},\n    {{\n        \"video fragment id\": 3,\n        \"video fragment description\": \"(the description)\",\n    }}\n]'''\"\"\" \\\n          f\"\"\"Remember to make sure that the description of each video clip is not long, no more than ten words, but there can be so many video clips.\\n\"\"\" \\\n          f\"\"\"Most important thing: Read this script carefully and don't pull down any details.\\n\"\"\" \\\n          f\"\"\"Ensure that all description statements are as natural and syntactically correct as possible.\\n\"\"\" \\\n          f\"\"\"Most important: Try to have only one character in the description and avoid complex actions in video fragment description, such as: loaded in, fight, 
etc.\\n\"\"\" \n\n    answer = json_completion(ask)\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"\\n\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = answer.strip(\"'\")\n    answer = answer.strip(\"[\")\n    answer = answer.strip(\"]\")\n    answer = \"[\\n\" + answer + \"\\n]\"\n    f = open(file_path, \"w\")\n    f.write(answer)\n    f.close()\n    video_fragments = ast.literal_eval(answer)\n    video_list = []\n    for video_fragment in video_fragments:\n        video_list.append(video_fragment[\"video fragment description\"])\n    return video_list\n\n\ndef time_scripts(video_list, file_path):\n    try_times = 3\n    for i in range(try_times):\n        try:\n            new_video_list = []\n            num = 1\n            for video in video_list:\n                prompt = str(num) + \". 
\" + video\n                new_video_list.append(prompt)\n                num += 1\n            ask = f\"\"\"I want to make a video, this is my video production script in the following three single quotes '''{new_video_list}''', a paragraph in the script corresponds to a clip \"\"\" \\\n                f\"\"\"of the video, Now that you know that 16-frame videos have a length of 2 seconds, please help me plan how much time it will take for each video clip to fully interpret the meaning of the script.\"\"\" \\\n                f\"\"\"Each clip can only be 10 seconds maximum.\"\"\" \\\n                f\"\"\"Provide me the answer in JSON format(do not answer anything else) with the following keys: video fragment id, time.\"\"\" \\\n                f\"\"\"You must answer like the content in following three single quotes:\\n\"\"\" \\\n                f\"\"\"'''[\n                {{\n                    \"video fragment id\": 1,\n                    \"time\": 3,\n                }},\n                {{\n                    \"video fragment id\": 2,\n                    \"time\": 9,\n                }},\n                {{\n                    \"video fragment id\": 3,\n                    \"time\": 7,\n                }},\n                {{\n                    \"video fragment id\": 4,\n                    \"time\": 2,\n                }},\n                ]'''\"\"\" \\\n                f\"\"\"Remember that time must be less than 10.\"\"\"\n            answer = json_completion(ask)\n            answer = answer.strip(\"'\")\n            answer = answer.strip(\"[\")\n            answer = answer.strip(\"]\")\n            answer = answer.strip(\"'\")\n            answer = answer.strip(\"[\")\n            answer = answer.strip(\"]\")\n            answer = answer.strip(\"'\")\n            answer = answer.strip(\"[\")\n            answer = answer.strip(\"]\")\n            answer = answer.strip(\"'\")\n            answer = answer.strip(\"[\")\n            answer = 
answer.strip(\"]\")\n            answer = \"[\\n\" + answer + \"\\n]\"\n            f = open(file_path, \"w\")\n            f.write(answer)\n            f.close()\n            time_scripts = ast.literal_eval(answer)\n            time_list = []\n            for time_script in time_scripts:\n                time = time_script[\"time\"]\n                if time > 10:\n                    time = 10\n                time_list.append(time)\n            assert len(time_list) == len(video_list)\n            return time_list\n        except:\n            continue\n    assert len(time_list) == len(video_list)\n    return time_list\n\n\ndef translate_video_script(video_list, file_path):\n    try_times = 5\n    for i in range(try_times):\n        try:\n            ask = f\"\"\"I want to make a video, this is my video production script in the following three single quotes '''{video_list}''', \"\"\" \\\n                f\"\"\"please help me to translate every video fragment description into Chinese.\"\"\" \\\n                f\"\"\"Provide me the answer in JSON format(do not answer anything else) with the following keys: 序号, 描述.\"\"\" \\\n                f\"\"\"You must must must answer like the content in following three single quotes:\\n\"\"\" \\\n                f\"\"\"'''[\n                {{\n                    \"序号\": 1,\n                    \"描述\": \"(视频片段描述)\",\n                }},\n                {{\n                    \"序号\": 2,\n                    \"描述\": \"(视频片段描述)\",\n                }}\n                ]'''\"\"\"\n\n            answer = json_completion(ask)\n            answer = answer.strip(\"'\")\n            answer = answer.strip(\"[\")\n            answer = answer.strip(\"]\")\n            answer = answer.strip(\"'\")\n            answer = answer.strip(\"[\")\n            answer = answer.strip(\"]\")\n            answer = answer.strip(\"'\")\n            answer = answer.strip(\"[\")\n            answer = answer.strip(\"]\")\n            answer = 
answer.strip(\"'\")\n            answer = answer.strip(\"[\")\n            answer = answer.strip(\"]\")\n            answer = \"[\\n\" + answer + \"\\n]\"\n            f = open(file_path, \"w\")\n            f.write(answer)\n            f.close()\n            video_fragments = ast.literal_eval(answer)\n            zh_video_list = []\n            for video_fragment in video_fragments:\n                zh_video_list.append(video_fragment[\"描述\"])\n            assert len(video_list) == len(zh_video_list)\n            return zh_video_list\n        except:\n            continue\n    assert len(video_list) == len(zh_video_list)\n    return zh_video_list\n\n\ndef readscript(script_file_path):\n    with open(script_file_path, \"r\", encoding='utf-8') as f: \n        script = f.read()\n        video_fragments = ast.literal_eval(script)\n        video_list = []\n        for video_fragment in video_fragments:\n            video_list.append(video_fragment[\"video fragment description\"])\n    return video_list\n\n\ndef readzhscript(zh_file_path):\n    with open(zh_file_path, \"r\", encoding='utf-8') as f: \n        script = f.read()\n        video_fragments = ast.literal_eval(script)\n        video_list = []\n        for video_fragment in video_fragments:\n            video_list.append(video_fragment[\"描述\"])\n    return video_list\n\n\ndef readtimescript(time_file_path):\n    with open(time_file_path, \"r\", encoding='utf-8') as f: \n        time_scripts = f.read()\n        time_scripts = ast.literal_eval(time_scripts)\n        time_list = []\n        for time_script in time_scripts:\n            frames = time_script[\"time\"]\n            time_list.append(frames)\n    return time_list\n    \n    \ndef readprotagonistscript(protagonist_file_path):\n    with open(protagonist_file_path, \"r\", encoding='utf-8') as f: \n        protagonist_scripts = f.read()\n        protagonist_scripts = ast.literal_eval(protagonist_scripts)\n        protagonists_places_dict = {}\n        for 
protagonist_script in protagonist_scripts:\n            protagonists_places_dict[protagonist_script[\"name\"]] = protagonist_script[\"description\"]\n    return protagonists_places_dict\n    \n    \ndef readreferencescript(video_list, character_places, reference_file_path):\n    new_video_list = []\n    num = 1\n    for video in video_list:\n        prompt = str(num) + \". \" + video\n        new_video_list.append(prompt)\n        num += 1\n    key_list = []\n    i = 1\n    for key, value in character_places.items():\n        key_list.append(str(i) + \". \" + key)\n    with open(reference_file_path, \"r\", encoding='utf-8') as f: \n        reference_file = f.read()\n        reference_list = []\n        protagonists_places_reference = ast.literal_eval(reference_file)\n        for i, prompt in enumerate(video_list):\n            prompt = prompt.lower()\n            for j, key in enumerate(key_list):\n                if key.lower() in prompt:\n                    protagonists_places_reference[i][\"character/place id\"] = [j + 1]\n            \n        for protagonist_place_reference in protagonists_places_reference:\n            reference_list.append(protagonist_place_reference[\"character/place id\"])\n    return reference_list\n    \n"
  },
  {
    "path": "vlogger/videoaudio.py",
    "content": "import os\nimport ast\nfrom IPython.display import Audio\nimport nltk  # we'll use this to split into sentences\nimport numpy as np\n\nfrom bark import generate_audio, SAMPLE_RATE\nfrom moviepy.editor import concatenate_videoclips, concatenate_audioclips\nfrom moviepy.editor import VideoFileClip, AudioFileClip, AudioClip, CompositeAudioClip\nimport librosa\nimport soundfile as sf\nimport math\n\n\ndef make_audio(en_prompt_file, output_dir):\n    print(\"Begin to make the aside!\")\n    prompt_list = []\n    with open(en_prompt_file, 'r', encoding='utf-8') as f:\n        video_prompts = f.read()\n        video_fragments = ast.literal_eval(video_prompts)\n        for video_fragment in video_fragments:\n            prompt_list.append(video_fragment[\"video fragment description\"])\n\n    if not os.path.exists(output_dir):\n        os.makedirs(output_dir)\n    for i, prompt in enumerate(prompt_list):\n        sentences = nltk.sent_tokenize(prompt)\n        SPEAKER = \"v2/en_speaker_1\"\n        silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence\n\n        pieces = []\n        for sentence in sentences:\n            audio_array = generate_audio(sentence, history_prompt=SPEAKER)\n            # audio_array = generate_audio(sentence)\n            pieces += [audio_array, silence.copy()]\n        audio = Audio(np.concatenate(pieces), rate=SAMPLE_RATE)\n        with open(os.path.join(output_dir, str(i) + \".wav\"), 'w+b') as f:\n            f.write(audio.data)\n\n\ndef merge_video_audio(video_dir, audio_dir, output_dir):\n    video_fnames = []\n    for fname in os.listdir(video_dir):\n        if not fname.startswith(\"result\"):\n            video_fnames.append(fname)\n    audio_fnames = []\n    for fname in os.listdir(audio_dir):\n        if not fname.startswith(\"result\") and not fname.startswith(\"fast\"):\n            audio_fnames.append(fname)\n    video_fnames.sort(key=lambda x: int(x.split('.')[0]))\n    
audio_fnames.sort(key=lambda x: int(x.split('.')[0]))\n    assert len(video_fnames) == len(audio_fnames), 'The number of videos is not equal to audios.'\n    if not os.path.exists(output_dir):\n        os.makedirs(output_dir)\n    audios = []\n    for i, (video_fname, audio_fname) in enumerate(zip(video_fnames, audio_fnames)):\n        video = VideoFileClip(os.path.join(video_dir, video_fname))\n        audio = AudioFileClip(os.path.join(audio_dir, audio_fname))\n\n        video_duration = video.duration\n        audio_duration = audio.duration\n        if audio_duration > video_duration:\n            y, sr = librosa.load(os.path.join(audio_dir, audio_fname))\n            speed_change = audio_duration / video_duration\n            y_stretched = librosa.effects.time_stretch(y, rate=speed_change)\n            sf.write(os.path.join(audio_dir, \"fast_video.wav\"), y_stretched, sr)\n            audio = AudioFileClip(os.path.join(audio_dir, \"fast_video.wav\"))\n        else:\n            silence_len = math.ceil(video_duration * audio.fps) / audio.fps  # make sure the silence duration not less than required\n            silence = AudioClip(lambda t: [0] * audio.nchannels, duration=silence_len, fps=audio.fps)\n            audio = CompositeAudioClip([audio, silence])\n        \n        audios.append(audio)\n        video = video.set_audio(audio)\n        video.write_videofile(os.path.join(output_dir, str(i) + \".mp4\"))\n    final_audio = concatenate_audioclips(audios)\n    final_audio.write_audiofile(os.path.join(audio_dir, \"result\" + \".wav\"))\n\n\n\ndef concatenate_videos(video_dir, output_dir=None):\n    if output_dir is None:\n        output_dir = video_dir\n    video_fnames = []\n    for fname in os.listdir(video_dir):\n        if not fname.startswith(\"result\") and not fname.startswith(\"audio\"):\n            video_fnames.append(fname)\n    video_fnames.sort(key=lambda x: int(x.split('.')[0]))\n    if not os.path.exists(output_dir):\n        
os.makedirs(output_dir)\n    \n    video_clips = [VideoFileClip(os.path.join(video_dir, video_fname)) for video_fname in video_fnames]\n    audio_clips = [video.audio for video in video_clips]\n\n    final_video = concatenate_videoclips(video_clips, method=\"compose\")\n    final_audio = concatenate_audioclips(audio_clips)\n\n    final_clip = final_video.set_audio(final_audio)\n    final_clip.write_videofile(os.path.join(output_dir, \"result.mp4\"))\n"
  },
  {
    "path": "vlogger/videocaption.py",
    "content": "import torch\nimport ast\nimport os\nimport cv2 as cv\nfrom PIL import Image, ImageDraw, ImageFont\nfrom decord import VideoReader, cpu\nimport torchvision\nimport numpy as np\n\n\ndef captioning(en_prompt_file, zh_prompt_file, input_video_dir, output_video_dir):\n    prompt_list = []\n    with open(en_prompt_file, 'r', encoding='utf-8') as f:\n        video_prompts = f.read()\n        video_fragments = ast.literal_eval(video_prompts)\n        for video_fragment in video_fragments:\n            prompt_list.append(video_fragment[\"video fragment description\"])\n            \n    video_fnames = []\n    for fname in os.listdir(input_video_dir):\n        try:\n            int(fname.split('.')[0])\n            video_fnames.append(fname)\n        except:\n            continue\n    video_fnames.sort(key=lambda x: int(x.split('.')[0]))\n\n    font_face = cv.FONT_HERSHEY_COMPLEX\n    if not os.path.exists(output_video_dir):\n        os.makedirs(output_video_dir)\n    for i in range(len(video_fnames)):\n        font_zh = ImageFont.truetype(font='MSYH.TTC', size=18)\n        fontScale = 0.4\n        video_path = os.path.join(input_video_dir, video_fnames[i])\n        video = VideoReader(video_path, ctx=cpu(0))\n        video = video[:].asnumpy()\n        (fw, fh), bh = cv.getTextSize(prompt_list[i], font_face, fontScale, 1)\n        pos_en = (int((video[0].shape[1] - fw) / 2), 300)\n        if pos_en[0] < 0:\n            scale = video[0].shape[1] / fw\n            fontScale *= scale\n            pos_en = (0, 300)\n        for j in range(video.shape[0]):\n            cv.putText(video[j], prompt_list[i], pos_en, font_face, fontScale, (255, 255, 255), 1, cv.LINE_AA)\n            img = Image.fromarray(cv.cvtColor(video[j], cv.COLOR_BGR2RGB))\n            draw = ImageDraw.Draw(img)\n            img = np.array(img)\n            video[j] = cv.cvtColor(img, cv.COLOR_RGB2BGR)\n        torchvision.io.write_video(output_video_dir + \"/\" + str(i) + '.mp4', video, 
fps=8)\n    print(\"Caption OK\", flush=True)\n\n"
  },
  {
    "path": "vlogger/videofusion.py",
    "content": "import torch\nimport os\nfrom decord import VideoReader, cpu\nimport numpy as np\nimport torchvision\n\n\ndef fusion(path):\n    fnames = []\n    for fname in os.listdir(path):\n        if not fname.startswith(\"result\"):\n            fnames.append(fname)\n    fnames.sort(key=lambda x: int(x.split('.')[0]))\n    for i, fname in enumerate(fnames):\n        fpath = os.path.join(path, fname)\n        video = VideoReader(fpath, ctx=cpu(0))\n        video = video[:].asnumpy()\n        if i == 0:\n            result = video\n        else:\n            result = np.concatenate((result, video), axis=0)\n    torchvision.io.write_video(path + \"/\" + \"result\" + '.mp4', result, fps=8)\n\n"
  }
]