Showing preview only (7,345K chars total). Download the full file or copy to clipboard to get everything.
Repository: KwaiVGI/3DTrajMaster
Branch: main
Commit: 0af2932ffe51
Files: 276
Total size: 6.9 MB
Directory structure:
gitextract_mycvvgja/
├── CogVideo/
│ ├── .github/
│ │ ├── ISSUE_TEMPLATE/
│ │ │ ├── bug_report.yaml
│ │ │ └── feature-request.yaml
│ │ └── PULL_REQUEST_TEMPLATE/
│ │ └── pr_template.md
│ ├── .gitignore
│ ├── LICENSE
│ ├── MODEL_LICENSE
│ ├── README.md
│ ├── README_ja.md
│ ├── README_zh.md
│ ├── download.sh
│ ├── finetune/
│ │ ├── README.md
│ │ ├── README_ja.md
│ │ ├── README_zh.md
│ │ ├── accelerate_config_machine_single.yaml
│ │ ├── accelerate_config_machine_single_debug.yaml
│ │ ├── finetune_single_rank_injector.sh
│ │ ├── finetune_single_rank_lora.sh
│ │ ├── hostfile.txt
│ │ ├── models/
│ │ │ ├── attention.py
│ │ │ ├── attention_processor.py
│ │ │ ├── cogvideox_transformer_3d.py
│ │ │ ├── embeddings.py
│ │ │ ├── pipeline_cogvideox.py
│ │ │ ├── pipeline_output.py
│ │ │ └── utils.py
│ │ ├── train_cogvideox_injector.py
│ │ └── train_cogvideox_lora.py
│ ├── inference/
│ │ ├── 3dtrajmaster_inference.py
│ │ ├── entity_zoo.txt
│ │ └── location_zoo.txt
│ ├── pyproject.toml
│ ├── requirements.txt
│ ├── tools/
│ │ ├── caption/
│ │ │ ├── README.md
│ │ │ ├── README_ja.md
│ │ │ ├── README_zh.md
│ │ │ ├── requirements.txt
│ │ │ └── video_caption.py
│ │ ├── convert_weight_sat2hf.py
│ │ ├── export_sat_lora_weight.py
│ │ ├── llm_flux_cogvideox/
│ │ │ ├── generate.sh
│ │ │ ├── gradio_page.py
│ │ │ └── llm_flux_cogvideox.py
│ │ ├── load_cogvideox_lora.py
│ │ ├── parallel_inference/
│ │ │ ├── parallel_inference_xdit.py
│ │ │ └── run.sh
│ │ ├── replicate/
│ │ │ ├── cog.yaml
│ │ │ ├── predict_i2v.py
│ │ │ └── predict_t2v.py
│ │ └── venhancer/
│ │ ├── README.md
│ │ ├── README_ja.md
│ │ └── README_zh.md
│ └── weights/
│ └── put weights here.txt
├── README.md
├── dataset/
│ ├── load_dataset.py
│ ├── traj_vis/
│ │ ├── D_loc1_61_t3n13_003d_Hemi12_1.json
│ │ ├── Hemi12_transforms.json
│ │ └── location_data_desert.json
│ ├── utils.py
│ └── vis_trajectory.py
└── eval/
├── GVHMR/
│ ├── .gitignore
│ ├── .gitmodules
│ ├── LICENSE
│ ├── README.md
│ ├── docs/
│ │ └── INSTALL.md
│ ├── download_eval_pose.sh
│ ├── eval.sh
│ ├── hmr4d/
│ │ ├── __init__.py
│ │ ├── build_gvhmr.py
│ │ ├── configs/
│ │ │ ├── __init__.py
│ │ │ ├── data/
│ │ │ │ └── mocap/
│ │ │ │ ├── testY.yaml
│ │ │ │ └── trainX_testY.yaml
│ │ │ ├── demo.yaml
│ │ │ ├── exp/
│ │ │ │ └── gvhmr/
│ │ │ │ └── mixed/
│ │ │ │ └── mixed.yaml
│ │ │ ├── global/
│ │ │ │ ├── debug/
│ │ │ │ │ ├── debug_train.yaml
│ │ │ │ │ └── debug_train_limit_data.yaml
│ │ │ │ └── task/
│ │ │ │ └── gvhmr/
│ │ │ │ ├── test_3dpw.yaml
│ │ │ │ ├── test_3dpw_emdb_rich.yaml
│ │ │ │ ├── test_emdb.yaml
│ │ │ │ └── test_rich.yaml
│ │ │ ├── hydra/
│ │ │ │ └── default.yaml
│ │ │ ├── siga24_release.yaml
│ │ │ ├── store_gvhmr.py
│ │ │ └── train.yaml
│ │ ├── datamodule/
│ │ │ └── mocap_trainX_testY.py
│ │ ├── dataset/
│ │ │ ├── bedlam/
│ │ │ │ ├── bedlam.py
│ │ │ │ ├── resource/
│ │ │ │ │ └── vname2lwh.pt
│ │ │ │ └── utils.py
│ │ │ ├── emdb/
│ │ │ │ ├── emdb_motion_test.py
│ │ │ │ └── utils.py
│ │ │ ├── h36m/
│ │ │ │ ├── camera-parameters.json
│ │ │ │ ├── h36m.py
│ │ │ │ └── utils.py
│ │ │ ├── imgfeat_motion/
│ │ │ │ └── base_dataset.py
│ │ │ ├── pure_motion/
│ │ │ │ ├── amass.py
│ │ │ │ ├── base_dataset.py
│ │ │ │ ├── cam_traj_utils.py
│ │ │ │ └── utils.py
│ │ │ ├── rich/
│ │ │ │ ├── resource/
│ │ │ │ │ ├── cam2params.pt
│ │ │ │ │ ├── seqname2imgrange.json
│ │ │ │ │ ├── test.txt
│ │ │ │ │ ├── train.txt
│ │ │ │ │ ├── val.txt
│ │ │ │ │ └── w2az_sahmr.json
│ │ │ │ ├── rich_motion_test.py
│ │ │ │ └── rich_utils.py
│ │ │ └── threedpw/
│ │ │ ├── threedpw_motion_test.py
│ │ │ ├── threedpw_motion_train.py
│ │ │ └── utils.py
│ │ ├── model/
│ │ │ ├── common_utils/
│ │ │ │ ├── optimizer.py
│ │ │ │ ├── scheduler.py
│ │ │ │ └── scheduler_cfg.py
│ │ │ └── gvhmr/
│ │ │ ├── callbacks/
│ │ │ │ ├── metric_3dpw.py
│ │ │ │ ├── metric_emdb.py
│ │ │ │ └── metric_rich.py
│ │ │ ├── gvhmr_pl.py
│ │ │ ├── gvhmr_pl_demo.py
│ │ │ ├── pipeline/
│ │ │ │ └── gvhmr_pipeline.py
│ │ │ └── utils/
│ │ │ ├── endecoder.py
│ │ │ ├── postprocess.py
│ │ │ └── stats_compose.py
│ │ ├── network/
│ │ │ ├── base_arch/
│ │ │ │ ├── embeddings/
│ │ │ │ │ └── rotary_embedding.py
│ │ │ │ └── transformer/
│ │ │ │ ├── encoder_rope.py
│ │ │ │ └── layer.py
│ │ │ ├── gvhmr/
│ │ │ │ └── relative_transformer.py
│ │ │ └── hmr2/
│ │ │ ├── __init__.py
│ │ │ ├── components/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pose_transformer.py
│ │ │ │ └── t_cond_mlp.py
│ │ │ ├── configs/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── model_config.yaml
│ │ │ │ └── smpl_mean_params.npz
│ │ │ ├── hmr2.py
│ │ │ ├── smpl_head.py
│ │ │ ├── utils/
│ │ │ │ ├── geometry.py
│ │ │ │ ├── preproc.py
│ │ │ │ └── smpl_wrapper.py
│ │ │ └── vit.py
│ │ └── utils/
│ │ ├── body_model/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── body_model.py
│ │ │ ├── body_model_smplh.py
│ │ │ ├── body_model_smplx.py
│ │ │ ├── coco_aug_dict.pth
│ │ │ ├── min_lbs.py
│ │ │ ├── seg_part_info.npy
│ │ │ ├── smpl_3dpw14_J_regressor_sparse.pt
│ │ │ ├── smpl_coco17_J_regressor.pt
│ │ │ ├── smpl_lite.py
│ │ │ ├── smpl_neutral_J_regressor.pt
│ │ │ ├── smpl_vert_segmentation.json
│ │ │ ├── smplx2smpl_sparse.pt
│ │ │ ├── smplx_lite.py
│ │ │ ├── smplx_verts437.pt
│ │ │ └── utils.py
│ │ ├── callbacks/
│ │ │ ├── lr_monitor.py
│ │ │ ├── prog_bar.py
│ │ │ ├── simple_ckpt_saver.py
│ │ │ └── train_speed_timer.py
│ │ ├── comm/
│ │ │ └── gather.py
│ │ ├── eval/
│ │ │ └── eval_utils.py
│ │ ├── geo/
│ │ │ ├── augment_noisy_pose.py
│ │ │ ├── flip_utils.py
│ │ │ ├── hmr_cam.py
│ │ │ ├── hmr_global.py
│ │ │ ├── quaternion.py
│ │ │ └── transforms.py
│ │ ├── geo_transform.py
│ │ ├── ik/
│ │ │ └── ccd_ik.py
│ │ ├── kpts/
│ │ │ └── kp2d_utils.py
│ │ ├── matrix.py
│ │ ├── net_utils.py
│ │ ├── preproc/
│ │ │ ├── __init__.py
│ │ │ ├── slam.py
│ │ │ ├── tracker.py
│ │ │ ├── vitfeat_extractor.py
│ │ │ ├── vitpose.py
│ │ │ └── vitpose_pytorch/
│ │ │ ├── __init__.py
│ │ │ └── src/
│ │ │ └── vitpose_infer/
│ │ │ ├── __init__.py
│ │ │ ├── builder/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbones/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── alexnet.py
│ │ │ │ │ ├── cpm.py
│ │ │ │ │ ├── hourglass.py
│ │ │ │ │ ├── hourglass_ae.py
│ │ │ │ │ ├── hrformer.py
│ │ │ │ │ ├── litehrnet.py
│ │ │ │ │ ├── mobilenet_v2.py
│ │ │ │ │ ├── mobilenet_v3.py
│ │ │ │ │ ├── mspn.py
│ │ │ │ │ ├── regnet.py
│ │ │ │ │ ├── resnest.py
│ │ │ │ │ ├── resnext.py
│ │ │ │ │ ├── rsn.py
│ │ │ │ │ ├── scnet.py
│ │ │ │ │ ├── seresnet.py
│ │ │ │ │ ├── seresnext.py
│ │ │ │ │ ├── shufflenet_v1.py
│ │ │ │ │ ├── shufflenet_v2.py
│ │ │ │ │ ├── tcn.py
│ │ │ │ │ ├── test_torch.py
│ │ │ │ │ ├── utils/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── channel_shuffle.py
│ │ │ │ │ │ ├── inverted_residual.py
│ │ │ │ │ │ ├── make_divisible.py
│ │ │ │ │ │ ├── se_layer.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── vgg.py
│ │ │ │ │ ├── vipnas_mbv3.py
│ │ │ │ │ ├── vipnas_resnet.py
│ │ │ │ │ └── vit.py
│ │ │ │ ├── configs/
│ │ │ │ │ └── coco/
│ │ │ │ │ ├── ViTPose_base_coco_256x192.py
│ │ │ │ │ ├── ViTPose_base_simple_coco_256x192.py
│ │ │ │ │ ├── ViTPose_huge_coco_256x192.py
│ │ │ │ │ ├── ViTPose_huge_simple_coco_256x192.py
│ │ │ │ │ ├── ViTPose_large_coco_256x192.py
│ │ │ │ │ ├── ViTPose_large_simple_coco_256x192.py
│ │ │ │ │ └── __init__.py
│ │ │ │ ├── heads/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── deconv_head.py
│ │ │ │ │ ├── deeppose_regression_head.py
│ │ │ │ │ ├── hmr_head.py
│ │ │ │ │ ├── interhand_3d_head.py
│ │ │ │ │ ├── temporal_regression_head.py
│ │ │ │ │ ├── topdown_heatmap_base_head.py
│ │ │ │ │ ├── topdown_heatmap_multi_stage_head.py
│ │ │ │ │ ├── topdown_heatmap_simple_head.py
│ │ │ │ │ ├── vipnas_heatmap_simple_head.py
│ │ │ │ │ └── voxelpose_head.py
│ │ │ │ └── model_builder.py
│ │ │ ├── model_builder.py
│ │ │ └── pose_utils/
│ │ │ ├── ViTPose_trt.py
│ │ │ ├── __init__.py
│ │ │ ├── convert_to_trt.py
│ │ │ ├── general_utils.py
│ │ │ ├── inference_test.py
│ │ │ ├── logger_helper.py
│ │ │ ├── pose_utils.py
│ │ │ ├── pose_viz.py
│ │ │ ├── timerr.py
│ │ │ └── visualizer.py
│ │ ├── pylogger.py
│ │ ├── seq_utils.py
│ │ ├── smplx_utils.py
│ │ ├── video_io_utils.py
│ │ ├── vis/
│ │ │ ├── README.md
│ │ │ ├── cv2_utils.py
│ │ │ ├── renderer.py
│ │ │ ├── renderer_tools.py
│ │ │ ├── renderer_utils.py
│ │ │ └── rich_logger.py
│ │ └── wis3d_utils.py
│ ├── pyproject.toml
│ ├── pyrightconfig.json
│ ├── requirements.txt
│ ├── setup.py
│ └── tools/
│ ├── demo/
│ │ ├── colab_demo.ipynb
│ │ ├── demo.py
│ │ └── demo_folder.py
│ ├── eval_pose.py
│ ├── train.py
│ ├── unitest/
│ │ ├── make_hydra_cfg.py
│ │ └── run_dataset.py
│ └── video/
│ ├── merge_folder.py
│ ├── merge_horizontal.py
│ └── merge_vertical.py
└── common_metrics_on_video_quality/
├── .gitignore
├── README.md
├── calculate_clip.py
├── calculate_fvd.py
├── calculate_fvd_styleganv.py
├── calculate_lpips.py
├── calculate_psnr.py
├── calculate_ssim.py
├── download_eval_visual.sh
├── eval_prompts.json
└── eval_visual.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: CogVideo/.github/ISSUE_TEMPLATE/bug_report.yaml
================================================
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve CogVideoX / 提交一个 Bug 问题报告来帮助我们改进 CogVideoX 开源模型
body:
- type: textarea
id: system-info
attributes:
label: System Info / 系統信息
description: Your operating environment / 您的运行环境信息
placeholder: Includes Cuda version, Diffusers version, Python version, operating system, hardware information (if you suspect a hardware problem)... / 包括Cuda版本,Diffusers,Python版本,操作系统,硬件信息(如果您怀疑是硬件方面的问题)...
validations:
required: true
- type: checkboxes
id: information-scripts-examples
attributes:
label: Information / 问题信息
description: 'The problem arises when using: / 问题出现在'
options:
- label: "The official example scripts / 官方的示例脚本"
- label: "My own modified scripts / 我自己修改的脚本和任务"
- type: textarea
id: reproduction
validations:
required: true
attributes:
label: Reproduction / 复现过程
description: |
Please provide a code example that reproduces the problem you encountered, preferably with a minimal reproduction unit.
If you have code snippets, error messages, stack traces, please provide them here as well.
Please format your code correctly using code tags. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
Do not use screenshots, as they are difficult to read and (more importantly) do not allow others to copy and paste your code.
请提供能重现您遇到的问题的代码示例,最好是最小复现单元。
如果您有代码片段、错误信息、堆栈跟踪,也请在此提供。
请使用代码标签正确格式化您的代码。请参见 https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
请勿使用截图,因为截图难以阅读,而且(更重要的是)不允许他人复制粘贴您的代码。
placeholder: |
Steps to reproduce the behavior/复现Bug的步骤:
1.
2.
3.
- type: textarea
id: expected-behavior
validations:
required: true
attributes:
label: Expected behavior / 期待表现
description: "A clear and concise description of what you would expect to happen. /简单描述您期望发生的事情。"
================================================
FILE: CogVideo/.github/ISSUE_TEMPLATE/feature-request.yaml
================================================
name: "\U0001F680 Feature request"
description: Submit a request for a new CogVideoX feature / 提交一个新的 CogVideoX开源模型的功能建议
labels: [ "feature" ]
body:
- type: textarea
id: feature-request
validations:
required: true
attributes:
label: Feature request / 功能建议
description: |
A brief description of the functional proposal. Links to corresponding papers and code are desirable.
对功能建议的简述。最好提供对应的论文和代码链接。
- type: textarea
id: motivation
validations:
required: true
attributes:
label: Motivation / 动机
description: |
Your motivation for making the suggestion. If that motivation is related to another GitHub issue, link to it here.
您提出建议的动机。如果该动机与另一个 GitHub 问题有关,请在此处提供对应的链接。
- type: textarea
id: contribution
validations:
required: true
attributes:
label: Your contribution / 您的贡献
description: |
Your PR link or any other link you can help with.
您的PR链接或者其他您能提供帮助的链接。
================================================
FILE: CogVideo/.github/PULL_REQUEST_TEMPLATE/pr_template.md
================================================
# Raise valuable PR / 提出有价值的PR
## Caution / 注意事项:
Users should keep the following points in mind when submitting PRs:
1. Ensure that your code meets the requirements in the [specification](../../resources/contribute.md).
2. the proposed PR should be relevant, if there are multiple ideas and optimizations, they should be assigned to different PRs.
用户在提交PR时候应该注意以下几点:
1. 确保您的代码符合 [规范](../../resources/contribute_zh.md) 中的要求。
2. 提出的PR应该具有针对性,如果具有多个不同的想法和优化方案,应该分配到不同的PR中。
## 不应该提出的PR / PRs that should not be proposed
If a developer proposes a PR about any of the following, it may be closed or Rejected.
1. those that don't describe improvement options.
2. multiple issues of different types combined in one PR.
3. The proposed PR is highly duplicative of already existing PRs.
如果开发者提出关于以下方面的PR,则可能会被直接关闭或拒绝通过。
1. 没有说明改进方案的。
2. 多个不同类型的问题合并在一个PR中的。
3. 提出的PR与已经存在的PR高度重复的。
# 检查您的PR
- [ ] Have you read the Contributor Guidelines, Pull Request section? / 您是否阅读了贡献者指南、Pull Request 部分?
- [ ] Has this been discussed/approved via a Github issue or forum? If so, add a link. / 是否通过 Github 问题或论坛讨论/批准过?如果是,请添加链接。
- [ ] Did you make sure you updated the documentation with your changes? Here are the Documentation Guidelines, and here are the Documentation Formatting Tips. /您是否确保根据您的更改更新了文档?这里是文档指南,这里是文档格式化技巧。
- [ ] Did you write new required tests? / 您是否编写了新的必要测试?
- [ ] Are your PRs for only one issue / 您的PR是否仅针对一个问题
================================================
FILE: CogVideo/.gitignore
================================================
*__pycache__/
samples*/
runs/
checkpoints/
master_ip
logs/
*.DS_Store
.idea
output*
test*
================================================
FILE: CogVideo/LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2024 CogVideo Model Team @ Zhipu AI
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: CogVideo/MODEL_LICENSE
================================================
The CogVideoX License
1. Definitions
“Licensor” means the CogVideoX Model Team that distributes its Software.
“Software” means the CogVideoX model parameters made available under this license.
2. License Grant
Under the terms and conditions of this license, the licensor hereby grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license. The intellectual property rights of the generated content belong to the user to the extent permitted by applicable local laws.
This license allows you to freely use all open-source models in this repository for academic research. Users who wish to use the models for commercial purposes must register and obtain a basic commercial license in https://open.bigmodel.cn/mla/form .
Users who have registered and obtained the basic commercial license can use the models for commercial activities for free, but must comply with all terms and conditions of this license. Additionally, the number of service users (visits) for your commercial activities must not exceed 1 million visits per month.
If the number of service users (visits) for your commercial activities exceeds 1 million visits per month, you need to contact our business team to obtain more commercial licenses.
The above copyright statement and this license statement should be included in all copies or significant portions of this software.
3. Restriction
You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any military, or illegal purposes.
You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
4. Disclaimer
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
5. Limitation of Liability
EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
6. Dispute Resolution
This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at license@zhipuai.cn.
1. 定义
“许可方”是指分发其软件的 CogVideoX 模型团队。
“软件”是指根据本许可提供的 CogVideoX 模型参数。
2. 许可授予
根据本许可的条款和条件,许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可。生成内容的知识产权所属,可根据适用当地法律的规定,在法律允许的范围内由用户享有生成内容的知识产权或其他权利。
本许可允许您免费使用本仓库中的所有开源模型进行学术研究。对于希望将模型用于商业目的的用户,需在 https://open.bigmodel.cn/mla/form 完成登记并获得基础商用授权。
经过登记并获得基础商用授权的用户可以免费使用本模型进行商业活动,但必须遵守本许可的所有条款和条件。
在本许可证下,您的商业活动的服务用户数量(访问量)不得超过100万人次访问 / 每月。如果超过,您需要与我们的商业团队联系以获得更多的商业许可。
上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。
3.限制
您不得出于任何军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。
您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。
4.免责声明
本软件“按原样”提供,不提供任何明示或暗示的保证,包括但不限于对适销性、特定用途的适用性和非侵权性的保证。
在任何情况下,作者或版权持有人均不对任何索赔、损害或其他责任负责,无论是在合同诉讼、侵权行为还是其他方面,由软件或软件的使用或其他交易引起、由软件引起或与之相关 软件。
5. 责任限制
除适用法律禁止的范围外,在任何情况下且根据任何法律理论,无论是基于侵权行为、疏忽、合同、责任或其他原因,任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、 或间接损害,或任何其他商业损失,即使许可人已被告知此类损害的可能性。
6.争议解决
本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。
请注意,许可证可能会更新到更全面的版本。 有关许可和版权的任何问题,请通过 license@zhipuai.cn 与我们联系。
================================================
FILE: CogVideo/README.md
================================================
# CogVideo & CogVideoX
[中文阅读](./README_zh.md)
[日本語で読む](./README_ja.md)
<div align="center">
<img src=resources/logo.svg width="50%"/>
</div>
<p align="center">
Experience the CogVideoX-5B model online at <a href="https://huggingface.co/spaces/THUDM/CogVideoX-5B" target="_blank"> 🤗 Huggingface Space</a> or <a href="https://modelscope.cn/studios/ZhipuAI/CogVideoX-5b-demo" target="_blank"> 🤖 ModelScope Space</a>
</p>
<p align="center">
📚 View the <a href="https://arxiv.org/abs/2408.06072" target="_blank">paper</a> and <a href="https://zhipu-ai.feishu.cn/wiki/DHCjw1TrJiTyeukfc9RceoSRnCh" target="_blank">user guide</a>
</p>
<p align="center">
👋 Join our <a href="resources/WECHAT.md" target="_blank">WeChat</a> and <a href="https://discord.gg/dCGfUsagrD" target="_blank">Discord</a>
</p>
<p align="center">
📍 Visit <a href="https://chatglm.cn/video?lang=en&fr=osm_cogvideo">QingYing</a> and <a href="https://open.bigmodel.cn/?utm_campaign=open&_channel_track_key=OWTVNma9">API Platform</a> to experience larger-scale commercial video generation models.
</p>
## Project Updates
- 🔥🔥 **News**: ```2024/10/13```: A more cost-effective fine-tuning framework for `CogVideoX-5B` that works with a single
4090 GPU, [cogvideox-factory](https://github.com/a-r-r-o-w/cogvideox-factory), has been released. It supports
fine-tuning with multiple resolutions. Feel free to use it!
- 🔥 **News**: ```2024/10/10```: We have updated our technical report. Please
click [here](https://arxiv.org/pdf/2408.06072) to view it. More training details and a demo have been added. To see
the demo, click [here](https://yzy-thu.github.io/CogVideoX-demo/).
- 🔥 **News**: ```2024/10/09```: We have publicly
released the [technical documentation](https://zhipu-ai.feishu.cn/wiki/DHCjw1TrJiTyeukfc9RceoSRnCh) for CogVideoX
fine-tuning on Feishu, further increasing distribution flexibility. All examples in the public documentation can be
fully reproduced.
- 🔥 **News**: ```2024/9/19```: We have open-sourced the CogVideoX series image-to-video model **CogVideoX-5B-I2V**.
This model can take an image as a background input and generate a video combined with prompt words, offering greater
controllability. With this, the CogVideoX series models now support three tasks: text-to-video generation, video
continuation, and image-to-video generation. Welcome to try it online
at [Experience](https://huggingface.co/spaces/THUDM/CogVideoX-5B-Space).
- 🔥 ```2024/9/19```: The Caption
model [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption), used in the training process of
CogVideoX to convert video data into text descriptions, has been open-sourced. Welcome to download and use it.
- 🔥 ```2024/8/27```: We have open-sourced a larger model in the CogVideoX series, **CogVideoX-5B**. We have
significantly optimized the model's inference performance, greatly lowering the inference threshold. You can run
**CogVideoX-2B** on older GPUs like `GTX 1080TI`, and **CogVideoX-5B** on desktop GPUs like `RTX 3060`. Please strictly
follow the [requirements](requirements.txt) to update and install dependencies, and refer
to [cli_demo](inference/cli_demo.py) for inference code. Additionally, the open-source license for the
**CogVideoX-2B** model has been changed to the **Apache 2.0 License**.
- 🔥 ```2024/8/6```: We have open-sourced **3D Causal VAE**, used for **CogVideoX-2B**, which can reconstruct videos with
almost no loss.
- 🔥 ```2024/8/6```: We have open-sourced the first model of the CogVideoX series video generation models,
**CogVideoX-2B**.
- 🌱 **Source**: ```2022/5/19```: We have open-sourced the CogVideo video generation model (now you can see it in
the `CogVideo` branch). This is the first open-source large Transformer-based text-to-video generation model. You can
access the [ICLR'23 paper](https://arxiv.org/abs/2205.15868) for technical details.
## Table of Contents
Jump to a specific section:
- [Quick Start](#Quick-Start)
- [SAT](#sat)
- [Diffusers](#Diffusers)
- [CogVideoX-2B Video Works](#cogvideox-2b-gallery)
- [Introduction to the CogVideoX Model](#Model-Introduction)
- [Full Project Structure](#project-structure)
- [Inference](#inference)
- [SAT](#sat)
- [Tools](#tools)
- [Introduction to CogVideo(ICLR'23) Model](#cogvideoiclr23)
- [Citations](#Citation)
- [Open Source Project Plan](#Open-Source-Project-Plan)
- [Model License](#Model-License)
## Quick Start
### Prompt Optimization
Before running the model, please refer to [this guide](inference/convert_demo.py) to see how we use large models like
GLM-4 (or other comparable products, such as GPT-4) to optimize the model. This is crucial because the model is trained
with long prompts, and a good prompt directly impacts the quality of the video generation.
### SAT
**Please make sure your Python version is between 3.10 and 3.12, inclusive of both 3.10 and 3.12.**
Follow instructions in [sat_demo](sat/README.md): Contains the inference code and fine-tuning code of SAT weights. It is
recommended to improve based on the CogVideoX model structure. Innovative researchers use this code to better perform
rapid stacking and development.
### Diffusers
**Please make sure your Python version is between 3.10 and 3.12, inclusive of both 3.10 and 3.12.**
```
pip install -r requirements.txt
```
Then follow [diffusers_demo](inference/cli_demo.py): A more detailed explanation of the inference code, mentioning the
significance of common parameters.
For more details on quantized inference, please refer
to [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao/). With Diffusers and TorchAO, quantized inference
is also possible leading to memory-efficient inference as well as speedup in some cases when compiled. A full list of
memory and time benchmarks with various settings on A100 and H100 has been published
at [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao).
## Gallery
### CogVideoX-5B
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
<tr>
<td>
<video src="https://github.com/user-attachments/assets/cf5953ea-96d3-48fd-9907-c4708752c714" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/fe0a78e6-b669-4800-8cf0-b5f9b5145b52" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/c182f606-8f8c-421d-b414-8487070fcfcb" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/7db2bbce-194d-434d-a605-350254b6c298" width="100%" controls autoplay loop></video>
</td>
</tr>
<tr>
<td>
<video src="https://github.com/user-attachments/assets/62b01046-8cab-44cc-bd45-4d965bb615ec" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/d78e552a-4b3f-4b81-ac3f-3898079554f6" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/30894f12-c741-44a2-9e6e-ddcacc231e5b" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/926575ca-7150-435b-a0ff-4900a963297b" width="100%" controls autoplay loop></video>
</td>
</tr>
</table>
### CogVideoX-2B
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
<tr>
<td>
<video src="https://github.com/user-attachments/assets/ea3af39a-3160-4999-90ec-2f7863c5b0e9" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/9de41efd-d4d1-4095-aeda-246dd834e91d" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/941d6661-6a8d-4a1b-b912-59606f0b2841" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/938529c4-91ae-4f60-b96b-3c3947fa63cb" width="100%" controls autoplay loop></video>
</td>
</tr>
</table>
To view the corresponding prompt words for the gallery, please click [here](resources/galary_prompt.md).
## Model Introduction
CogVideoX is an open-source version of the video generation model originating
from [QingYing](https://chatglm.cn/video?lang=en&fr=osm_cogvideo). The table below displays the list of video generation
models we currently offer, along with their foundational information.
<table style="border-collapse: collapse; width: 100%;">
<tr>
<th style="text-align: center;">Model Name</th>
<th style="text-align: center;">CogVideoX-2B</th>
<th style="text-align: center;">CogVideoX-5B</th>
<th style="text-align: center;">CogVideoX-5B-I2V</th>
</tr>
<tr>
<td style="text-align: center;">Model Description</td>
<td style="text-align: center;">Entry-level model, balancing compatibility. Low cost for running and secondary development.</td>
<td style="text-align: center;">Larger model with higher video generation quality and better visual effects.</td>
<td style="text-align: center;">CogVideoX-5B image-to-video version.</td>
</tr>
<tr>
<td style="text-align: center;">Inference Precision</td>
<td style="text-align: center;"><b>FP16*(recommended)</b>, BF16, FP32, FP8*, INT8, not supported: INT4</td>
<td colspan="2" style="text-align: center;"><b>BF16 (recommended)</b>, FP16, FP32, FP8*, INT8, not supported: INT4</td>
</tr>
<tr>
<td style="text-align: center;">Single GPU Memory Usage<br></td>
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: from 4GB* </b><br><b>diffusers INT8 (torchao): from 3.6GB*</b></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16: from 5GB* </b><br><b>diffusers INT8 (torchao): from 4.4GB*</b></td>
</tr>
<tr>
<td style="text-align: center;">Multi-GPU Inference Memory Usage</td>
<td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td>
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td>
</tr>
<tr>
<td style="text-align: center;">Inference Speed<br>(Step = 50, FP/BF16)</td>
<td style="text-align: center;">Single A100: ~90 seconds<br>Single H100: ~45 seconds</td>
<td colspan="2" style="text-align: center;">Single A100: ~180 seconds<br>Single H100: ~90 seconds</td>
</tr>
<tr>
<td style="text-align: center;">Fine-tuning Precision</td>
<td style="text-align: center;"><b>FP16</b></td>
<td colspan="2" style="text-align: center;"><b>BF16</b></td>
</tr>
<tr>
<td style="text-align: center;">Fine-tuning Memory Usage</td>
<td style="text-align: center;">47 GB (bs=1, LORA)<br> 61 GB (bs=2, LORA)<br> 62GB (bs=1, SFT)</td>
<td style="text-align: center;">63 GB (bs=1, LORA)<br> 80 GB (bs=2, LORA)<br> 75GB (bs=1, SFT)<br></td>
<td style="text-align: center;">78 GB (bs=1, LORA)<br> 75GB (bs=1, SFT, 16GPU)<br></td>
</tr>
<tr>
<td style="text-align: center;">Prompt Language</td>
<td colspan="3" style="text-align: center;">English*</td>
</tr>
<tr>
<td style="text-align: center;">Maximum Prompt Length</td>
<td colspan="3" style="text-align: center;">226 Tokens</td>
</tr>
<tr>
<td style="text-align: center;">Video Length</td>
<td colspan="3" style="text-align: center;">6 Seconds</td>
</tr>
<tr>
<td style="text-align: center;">Frame Rate</td>
<td colspan="3" style="text-align: center;">8 Frames / Second</td>
</tr>
<tr>
<td style="text-align: center;">Video Resolution</td>
<td colspan="3" style="text-align: center;">720 x 480, no support for other resolutions (including fine-tuning)</td>
</tr>
<tr>
<td style="text-align: center;">Position Encoding</td>
<td style="text-align: center;">3d_sincos_pos_embed</td>
<td style="text-align: center;">3d_sincos_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td>
</tr>
<tr>
<td style="text-align: center;">Download Link (Diffusers)</td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td>
</tr>
<tr>
<td style="text-align: center;">Download Link (SAT)</td>
<td colspan="3" style="text-align: center;"><a href="./sat/README.md">SAT</a></td>
</tr>
</table>
**Data Explanation**
+ While testing using the diffusers library, all optimizations included in the diffusers library were enabled. This
scheme has not been tested for actual memory usage on devices outside of **NVIDIA A100 / H100** architectures.
Generally, this scheme can be adapted to all **NVIDIA Ampere architecture** and above devices. If optimizations are
disabled, memory consumption will multiply, with peak memory usage being about 3 times the value in the table.
However, speed will increase by about 3-4 times. You can selectively disable some optimizations, including:
```
pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()
```
+ For multi-GPU inference, the `enable_sequential_cpu_offload()` optimization needs to be disabled.
+ Using INT8 models will significantly slow down inference; this is done to accommodate lower-memory GPUs while
  keeping video quality loss minimal.
+ The CogVideoX-2B model was trained in `FP16` precision, and all CogVideoX-5B models were trained in `BF16` precision.
We recommend using the precision in which the model was trained for inference.
+ [PytorchAO](https://github.com/pytorch/ao) and [Optimum-quanto](https://github.com/huggingface/optimum-quanto/) can be
used to quantize the text encoder, transformer, and VAE modules to reduce the memory requirements of CogVideoX. This
allows the model to run on free T4 Colabs or GPUs with smaller memory! Also, note that TorchAO quantization is fully
compatible with `torch.compile`, which can significantly improve inference speed. FP8 precision must be used on
devices with NVIDIA H100 and above, requiring source installation of `torch`, `torchao`, `diffusers`, and `accelerate`
Python packages. CUDA 12.4 is recommended.
+ The inference speed tests also used the above memory optimization scheme. Without memory optimization, inference speed
increases by about 10%. Only the `diffusers` version of the model supports quantization.
+ The model only supports English input; other languages can be translated into English for use via large model
refinement.
+ The memory usage of model fine-tuning is tested in an `8 * H100` environment, and the program automatically
uses `Zero 2` optimization. If a specific number of GPUs is marked in the table, that number or more GPUs must be used
for fine-tuning.
## Friendly Links
We highly welcome contributions from the community and actively contribute to the open-source community. The following
works have already been adapted for CogVideoX, and we invite everyone to use them:
+ [CogVideoX-Fun](https://github.com/aigc-apps/CogVideoX-Fun): CogVideoX-Fun is a modified pipeline based on the
CogVideoX architecture, supporting flexible resolutions and multiple launch methods.
+ [CogStudio](https://github.com/pinokiofactory/cogstudio): A separate repository for CogVideo's Gradio Web UI, which
supports more functional Web UIs.
+ [Xorbits Inference](https://github.com/xorbitsai/inference): A powerful and comprehensive distributed inference
framework, allowing you to easily deploy your own models or the latest cutting-edge open-source models with just one
click.
+ [ComfyUI-CogVideoXWrapper](https://github.com/kijai/ComfyUI-CogVideoXWrapper) Use the ComfyUI framework to integrate
CogVideoX into your workflow.
+ [VideoSys](https://github.com/NUS-HPC-AI-Lab/VideoSys): VideoSys provides a user-friendly, high-performance
infrastructure for video generation, with full pipeline support and continuous integration of the latest models and
techniques.
+ [AutoDL Space](https://www.codewithgpu.com/i/THUDM/CogVideo/CogVideoX-5b-demo): A one-click deployment Huggingface
Space image provided by community members.
+ [Interior Design Fine-Tuning Model](https://huggingface.co/collections/bertjiazheng/koolcogvideox-66e4762f53287b7f39f8f3ba):
is a fine-tuned model based on CogVideoX, specifically designed for interior design.
+ [xDiT](https://github.com/xdit-project/xDiT): xDiT is a scalable inference engine for Diffusion Transformers (DiTs)
  on multiple GPU clusters. xDiT supports real-time image and video generation services.
+ [cogvideox-factory](https://github.com/a-r-r-o-w/cogvideox-factory): A cost-effective
fine-tuning framework for CogVideoX, compatible with the `diffusers` version model. Supports more resolutions, and
fine-tuning CogVideoX-5B can be done with a single 4090 GPU.
+ [CogVideoX-Interpolation](https://github.com/feizc/CogvideX-Interpolation): A pipeline based on the modified CogVideoX
structure, aimed at providing greater flexibility for keyframe interpolation generation.
+ [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): DiffSynth Studio is a diffusion engine. It has
restructured the architecture, including text encoders, UNet, VAE, etc., enhancing computational performance while
maintaining compatibility with open-source community models. The framework has been adapted for CogVideoX.
## Project Structure
This open-source repository will guide developers to quickly get started with the basic usage and fine-tuning examples
of the **CogVideoX** open-source model.
### Quick Start with Colab
Here we provide four projects that can be run directly on free Colab T4 instances:
+ [CogVideoX-5B-T2V-Colab.ipynb](https://colab.research.google.com/drive/1pCe5s0bC_xuXbBlpvIH1z0kfdTLQPzCS?usp=sharing):
CogVideoX-5B Text-to-Video Colab code.
+ [CogVideoX-5B-T2V-Int8-Colab.ipynb](https://colab.research.google.com/drive/1DUffhcjrU-uz7_cpuJO3E_D4BaJT7OPa?usp=sharing):
CogVideoX-5B Quantized Text-to-Video Inference Colab code, which takes about 30 minutes per run.
+ [CogVideoX-5B-I2V-Colab.ipynb](https://colab.research.google.com/drive/17CqYCqSwz39nZAX2YyonDxosVKUZGzcX?usp=sharing):
CogVideoX-5B Image-to-Video Colab code.
+ [CogVideoX-5B-V2V-Colab.ipynb](https://colab.research.google.com/drive/1comfGAUJnChl5NwPuO8Ox5_6WCy4kbNN?usp=sharing):
CogVideoX-5B Video-to-Video Colab code.
### Inference
+ [cli_demo](inference/cli_demo.py): A more detailed inference code explanation, including the significance of
common parameters. All of this is covered here.
+ [cli_demo_quantization](inference/cli_demo_quantization.py):
Quantized model inference code that can run on devices with lower memory. You can also modify this code to support
running CogVideoX models in FP8 precision.
+ [diffusers_vae_demo](inference/cli_vae_demo.py): Code for running VAE inference separately.
+ [space demo](inference/gradio_composite_demo): The same GUI code as used in the Huggingface Space, with frame
interpolation and super-resolution tools integrated.
<div style="text-align: center;">
<img src="resources/web_demo.png" style="width: 100%; height: auto;" />
</div>
+ [convert_demo](inference/convert_demo.py): How to convert user input into long-form input suitable for CogVideoX.
Since CogVideoX is trained on long texts, we need to transform the input text distribution to match the training data
using an LLM. The script defaults to using GLM-4, but it can be replaced with GPT, Gemini, or any other large language
model.
+ [gradio_web_demo](inference/gradio_composite_demo): A simple Gradio web application demonstrating how to use the
CogVideoX-2B / 5B model to generate videos. Similar to our Huggingface Space, you can use this script to run a simple
web application for video generation.
### finetune
+ [finetune_demo](finetune/README.md): Fine-tuning scheme and details of the diffusers version of the CogVideoX model.
### sat
+ [sat_demo](sat/README.md): Contains the inference code and fine-tuning code of SAT weights. It is recommended to
improve based on the CogVideoX model structure. Innovative researchers use this code to better perform rapid stacking
and development.
### Tools
This folder contains some tools for model conversion / caption generation, etc.
+ [convert_weight_sat2hf](tools/convert_weight_sat2hf.py): Converts SAT model weights to Huggingface model weights.
+ [caption_demo](tools/caption/README.md): Caption tool, a model that understands videos and outputs descriptions in
text.
+ [export_sat_lora_weight](tools/export_sat_lora_weight.py): SAT fine-tuning model export tool, exports the SAT Lora
Adapter in diffusers format.
+ [load_cogvideox_lora](tools/load_cogvideox_lora.py): Tool code for loading the diffusers version of fine-tuned Lora
Adapter.
+ [llm_flux_cogvideox](tools/llm_flux_cogvideox/llm_flux_cogvideox.py): Automatically generate videos using an
open-source local large language model + Flux + CogVideoX.
+ [parallel_inference_xdit](tools/parallel_inference/parallel_inference_xdit.py):
Supported by [xDiT](https://github.com/xdit-project/xDiT), parallelize the
video generation process on multiple GPUs.
## CogVideo(ICLR'23)
The official repo for the
paper: [CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers](https://arxiv.org/abs/2205.15868)
is on the [CogVideo branch](https://github.com/THUDM/CogVideo/tree/CogVideo)
**CogVideo is able to generate relatively high-frame-rate videos.**
A 4-second clip of 32 frames is shown below.


<div align="center">
<video src="https://github.com/user-attachments/assets/2fa19651-e925-4a2a-b8d6-b3f216d490ba" width="80%" controls autoplay></video>
</div>
The demo for CogVideo is at [https://models.aminer.cn/cogvideo](https://models.aminer.cn/cogvideo/), where you can get
hands-on practice on text-to-video generation. *The original input is in Chinese.*
## Citation
🌟 If you find our work helpful, please leave us a star and cite our paper.
```
@article{yang2024cogvideox,
title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others},
journal={arXiv preprint arXiv:2408.06072},
year={2024}
}
@article{hong2022cogvideo,
title={CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers},
author={Hong, Wenyi and Ding, Ming and Zheng, Wendi and Liu, Xinghan and Tang, Jie},
journal={arXiv preprint arXiv:2205.15868},
year={2022}
}
```
We welcome your contributions! You can click [here](resources/contribute.md) for more information.
## License Agreement
The code in this repository is released under the [Apache 2.0 License](LICENSE).
The CogVideoX-2B model (including its corresponding Transformers module and VAE module) is released under
the [Apache 2.0 License](LICENSE).
The CogVideoX-5B model (Transformers module, include I2V and T2V) is released under
the [CogVideoX LICENSE](https://huggingface.co/THUDM/CogVideoX-5b/blob/main/LICENSE).
================================================
FILE: CogVideo/README_ja.md
================================================
# CogVideo & CogVideoX
[Read this in English](./README.md)
[中文阅读](./README_zh.md)
<div align="center">
<img src=resources/logo.svg width="50%"/>
</div>
<p align="center">
<a href="https://huggingface.co/spaces/THUDM/CogVideoX-5B" target="_blank"> 🤗 Huggingface Space</a> または <a href="https://modelscope.cn/studios/ZhipuAI/CogVideoX-5b-demo" target="_blank"> 🤖 ModelScope Space</a> で CogVideoX-5B モデルをオンラインで体験してください
</p>
<p align="center">
📚 <a href="https://arxiv.org/abs/2408.06072" target="_blank">論文</a>と<a href="https://zhipu-ai.feishu.cn/wiki/DHCjw1TrJiTyeukfc9RceoSRnCh" target="_blank">使用ドキュメント</a>を表示します。
</p>
<p align="center">
👋 <a href="resources/WECHAT.md" target="_blank">WeChat</a> と <a href="https://discord.gg/dCGfUsagrD" target="_blank">Discord</a> に参加
</p>
<p align="center">
📍 <a href="https://chatglm.cn/video?lang=en&fr=osm_cogvideo">清影</a> と <a href="https://open.bigmodel.cn/?utm_campaign=open&_channel_track_key=OWTVNma9">APIプラットフォーム</a> を訪問して、より大規模な商用ビデオ生成モデルを体験.
</p>
## 更新とニュース
- 🔥🔥 **ニュース**: ```2024/10/13```: コスト削減のため、単一の4090 GPUで`CogVideoX-5B`
を微調整できるフレームワーク [cogvideox-factory](https://github.com/a-r-r-o-w/cogvideox-factory)
がリリースされました。複数の解像度での微調整に対応しています。ぜひご利用ください!
- 🔥 **ニュース**: ```2024/10/10```: 技術報告書を更新し、より詳細なトレーニング情報とデモを追加しました。
- 🔥 **ニュース**: ```2024/10/10```: 技術報告書を更新しました。[こちら](https://arxiv.org/pdf/2408.06072)
をクリックしてご覧ください。さらにトレーニングの詳細とデモを追加しました。デモを見るには[こちら](https://yzy-thu.github.io/CogVideoX-demo/)
をクリックしてください。
- 🔥**ニュース**: ```2024/10/09```: 飛書の[技術ドキュメント](https://zhipu-ai.feishu.cn/wiki/DHCjw1TrJiTyeukfc9RceoSRnCh)
でCogVideoXの微調整ガイドを公開しています。分配の自由度をさらに高めるため、公開されているドキュメント内のすべての例が完全に再現可能です。
- 🔥**ニュース**: ```2024/9/19```: CogVideoXシリーズの画像生成ビデオモデル **CogVideoX-5B-I2V**
をオープンソース化しました。このモデルは、画像を背景入力として使用し、プロンプトワードと組み合わせてビデオを生成することができ、より高い制御性を提供します。これにより、CogVideoXシリーズのモデルは、テキストからビデオ生成、ビデオの継続、画像からビデオ生成の3つのタスクをサポートするようになりました。オンラインでの[体験](https://huggingface.co/spaces/THUDM/CogVideoX-5B-Space)
をお楽しみください。
- 🔥🔥 **ニュース**: ```2024/9/19```:
CogVideoXのトレーニングプロセスでビデオデータをテキスト記述に変換するために使用されるキャプションモデル [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption)
をオープンソース化しました。ダウンロードしてご利用ください。
- 🔥 ```2024/8/27```: CogVideoXシリーズのより大きなモデル **CogVideoX-5B**
をオープンソース化しました。モデルの推論性能を大幅に最適化し、推論のハードルを大幅に下げました。`GTX 1080TI` などの旧型GPUで
**CogVideoX-2B** を、`RTX 3060` などのデスクトップGPUで **CogVideoX-5B**
モデルを実行できます。依存関係を更新・インストールするために、[要件](requirements.txt)
を厳守し、推論コードは [cli_demo](inference/cli_demo.py) を参照してください。さらに、**CogVideoX-2B** モデルのオープンソースライセンスが
**Apache 2.0 ライセンス** に変更されました。
- 🔥 ```2024/8/6```: **CogVideoX-2B** 用の **3D Causal VAE** をオープンソース化しました。これにより、ビデオをほぼ無損失で再構築することができます。
- 🔥 ```2024/8/6```: CogVideoXシリーズのビデオ生成モデルの最初のモデル、**CogVideoX-2B** をオープンソース化しました。
- 🌱 **ソース**: ```2022/5/19```: CogVideoビデオ生成モデルをオープンソース化しました(現在、`CogVideo`
ブランチで確認できます)。これは、トランスフォーマーに基づく初のオープンソース大規模テキスト生成ビデオモデルです。技術的な詳細については、[ICLR'23論文](https://arxiv.org/abs/2205.15868)
をご覧ください。
**より強力なモデルが、より大きなパラメータサイズで登場予定です。お楽しみに!**
## 目次
特定のセクションにジャンプ:
- [クイックスタート](#クイックスタート)
- [SAT](#sat)
- [Diffusers](#Diffusers)
- [CogVideoX-2B ギャラリー](#CogVideoX-2B-ギャラリー)
- [モデル紹介](#モデル紹介)
- [プロジェクト構造](#プロジェクト構造)
- [推論](#推論)
- [sat](#sat)
- [ツール](#ツール)
- [プロジェクト計画](#プロジェクト計画)
- [モデルライセンス](#モデルライセンス)
- [CogVideo(ICLR'23)モデル紹介](#CogVideoICLR23)
- [引用](#引用)
## クイックスタート
### プロンプトの最適化
モデルを実行する前に、[こちら](inference/convert_demo.py)
を参考にして、GLM-4(または同等の製品、例えばGPT-4)の大規模モデルを使用してどのようにモデルを最適化するかをご確認ください。これは非常に重要です。モデルは長いプロンプトでトレーニングされているため、良いプロンプトがビデオ生成の品質に直接影響を与えます。
### SAT
[sat_demo](sat/README.md) の指示に従ってください:
SATウェイトの推論コードと微調整コードが含まれています。CogVideoXモデル構造に基づいて改善することをお勧めします。革新的な研究者は、このコードを使用して迅速なスタッキングと開発を行うことができます。
### Diffusers
```
pip install -r requirements.txt
```
次に [diffusers_demo](inference/cli_demo.py) を参照してください: 推論コードの詳細な説明が含まれており、一般的なパラメータの意味についても言及しています。
量子化推論の詳細については、[diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao/) を参照してください。Diffusers
と TorchAO を使用することで、量子化推論も可能となり、メモリ効率の良い推論や、コンパイル時に場合によっては速度の向上が期待できます。A100
および H100
上でのさまざまな設定におけるメモリおよび時間のベンチマークの完全なリストは、[diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao)
に公開されています。
## Gallery
### CogVideoX-5B
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
<tr>
<td>
<video src="https://github.com/user-attachments/assets/cf5953ea-96d3-48fd-9907-c4708752c714" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/fe0a78e6-b669-4800-8cf0-b5f9b5145b52" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/c182f606-8f8c-421d-b414-8487070fcfcb" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/7db2bbce-194d-434d-a605-350254b6c298" width="100%" controls autoplay loop></video>
</td>
</tr>
<tr>
<td>
<video src="https://github.com/user-attachments/assets/62b01046-8cab-44cc-bd45-4d965bb615ec" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/d78e552a-4b3f-4b81-ac3f-3898079554f6" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/30894f12-c741-44a2-9e6e-ddcacc231e5b" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/926575ca-7150-435b-a0ff-4900a963297b" width="100%" controls autoplay loop></video>
</td>
</tr>
</table>
### CogVideoX-2B
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
<tr>
<td>
<video src="https://github.com/user-attachments/assets/ea3af39a-3160-4999-90ec-2f7863c5b0e9" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/9de41efd-d4d1-4095-aeda-246dd834e91d" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/941d6661-6a8d-4a1b-b912-59606f0b2841" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/938529c4-91ae-4f60-b96b-3c3947fa63cb" width="100%" controls autoplay loop></video>
</td>
</tr>
</table>
ギャラリーの対応するプロンプトワードを表示するには、[こちら](resources/galary_prompt.md)をクリックしてください
## モデル紹介
CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源のオープンソース版ビデオ生成モデルです。
以下の表に、提供しているビデオ生成モデルの基本情報を示します:
<table style="border-collapse: collapse; width: 100%;">
<tr>
<th style="text-align: center;">モデル名</th>
<th style="text-align: center;">CogVideoX-2B</th>
<th style="text-align: center;">CogVideoX-5B</th>
<th style="text-align: center;">CogVideoX-5B-I2V </th>
</tr>
<tr>
<td style="text-align: center;">推論精度</td>
<td style="text-align: center;"><b>FP16*(推奨)</b>, BF16, FP32, FP8*, INT8, INT4は非対応</td>
<td colspan="2" style="text-align: center;"><b>BF16(推奨)</b>, FP16, FP32, FP8*, INT8, INT4は非対応</td>
</tr>
<tr>
<td style="text-align: center;">単一GPUのメモリ消費<br></td>
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: 4GBから* </b><br><b>diffusers INT8(torchao): 3.6GBから*</b></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16 : 5GBから* </b><br><b>diffusers INT8(torchao): 4.4GBから* </b></td>
</tr>
<tr>
<td style="text-align: center;">マルチGPUのメモリ消費</td>
<td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td>
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td>
</tr>
<tr>
<td style="text-align: center;">推論速度<br>(ステップ = 50, FP/BF16)</td>
<td style="text-align: center;">単一A100: 約90秒<br>単一H100: 約45秒</td>
<td colspan="2" style="text-align: center;">単一A100: 約180秒<br>単一H100: 約90秒</td>
</tr>
<tr>
<td style="text-align: center;">ファインチューニング精度</td>
<td style="text-align: center;"><b>FP16</b></td>
<td colspan="2" style="text-align: center;"><b>BF16</b></td>
</tr>
<tr>
<td style="text-align: center;">ファインチューニング時のメモリ消費</td>
<td style="text-align: center;">47 GB (bs=1, LORA)<br> 61 GB (bs=2, LORA)<br> 62GB (bs=1, SFT)</td>
<td style="text-align: center;">63 GB (bs=1, LORA)<br> 80 GB (bs=2, LORA)<br> 75GB (bs=1, SFT)<br></td>
<td style="text-align: center;">78 GB (bs=1, LORA)<br> 75GB (bs=1, SFT, 16GPU)<br></td>
</tr>
<tr>
<td style="text-align: center;">プロンプト言語</td>
<td colspan="3" style="text-align: center;">英語*</td>
</tr>
<tr>
<td style="text-align: center;">プロンプトの最大トークン数</td>
<td colspan="3" style="text-align: center;">226トークン</td>
</tr>
<tr>
<td style="text-align: center;">ビデオの長さ</td>
<td colspan="3" style="text-align: center;">6秒</td>
</tr>
<tr>
<td style="text-align: center;">フレームレート</td>
<td colspan="3" style="text-align: center;">8フレーム/秒</td>
</tr>
<tr>
<td style="text-align: center;">ビデオ解像度</td>
<td colspan="3" style="text-align: center;">720 * 480、他の解像度は非対応(ファインチューニング含む)</td>
</tr>
<tr>
<td style="text-align: center;">位置エンコーディング</td>
<td style="text-align: center;">3d_sincos_pos_embed</td>
<td style="text-align: center;">3d_sincos_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td>
</tr>
<tr>
<td style="text-align: center;">ダウンロードリンク (Diffusers)</td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td>
</tr>
<tr>
<td style="text-align: center;">ダウンロードリンク (SAT)</td>
<td colspan="3" style="text-align: center;"><a href="./sat/README_ja.md">SAT</a></td>
</tr>
</table>
**データ解説**
+ diffusersライブラリを使用してテストする際には、`diffusers`ライブラリが提供する全ての最適化が有効になっています。この方法は
**NVIDIA A100 / H100**以外のデバイスでのメモリ/メモリ消費のテストは行っていません。通常、この方法は**NVIDIA
Ampereアーキテクチャ**
以上の全てのデバイスに適応できます。最適化を無効にすると、メモリ消費は倍増し、ピークメモリ使用量は表の3倍になりますが、速度は約3〜4倍向上します。以下の最適化を部分的に無効にすることが可能です:
```
pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()
```
+ マルチGPUで推論する場合、`enable_sequential_cpu_offload()`最適化を無効にする必要があります。
+ INT8モデルを使用すると推論速度が低下しますが、これはメモリの少ないGPUで正常に推論を行い、ビデオ品質の損失を最小限に抑えるための措置です。推論速度は大幅に低下します。
+ CogVideoX-2Bモデルは`FP16`精度でトレーニングされており、CogVideoX-5Bモデルは`BF16`
精度でトレーニングされています。推論時にはモデルがトレーニングされた精度を使用することをお勧めします。
+ [PytorchAO](https://github.com/pytorch/ao)および[Optimum-quanto](https://github.com/huggingface/optimum-quanto/)
は、CogVideoXのメモリ要件を削減するためにテキストエンコーダ、トランスフォーマ、およびVAEモジュールを量子化するために使用できます。これにより、無料のT4
Colabやより少ないメモリのGPUでモデルを実行することが可能になります。同様に重要なのは、TorchAOの量子化は`torch.compile`
と完全に互換性があり、推論速度を大幅に向上させることができる点です。`NVIDIA H100`およびそれ以上のデバイスでは`FP8`
精度を使用する必要があります。これには、`torch`、`torchao`、`diffusers`、`accelerate`
Pythonパッケージのソースコードからのインストールが必要です。`CUDA 12.4`の使用をお勧めします。
+ 推論速度テストも同様に、上記のメモリ最適化方法を使用しています。メモリ最適化を使用しない場合、推論速度は約10%向上します。
`diffusers`バージョンのモデルのみが量子化をサポートしています。
+ モデルは英語入力のみをサポートしており、他の言語は大規模モデルの改善を通じて英語に翻訳できます。
+ モデルのファインチューニングに使用されるメモリは`8 * H100`環境でテストされています。プログラムは自動的に`Zero 2`
最適化を使用しています。表に具体的なGPU数が記載されている場合、ファインチューニングにはその数以上のGPUが必要です。
## 友好的リンク
コミュニティからの貢献を大歓迎し、私たちもオープンソースコミュニティに積極的に貢献しています。以下の作品はすでにCogVideoXに対応しており、ぜひご利用ください:
+ [CogVideoX-Fun](https://github.com/aigc-apps/CogVideoX-Fun):
CogVideoX-Funは、CogVideoXアーキテクチャを基にした改良パイプラインで、自由な解像度と複数の起動方法をサポートしています。
+ [CogStudio](https://github.com/pinokiofactory/cogstudio): CogVideo の Gradio Web UI の別のリポジトリ。より高機能な Web
UI をサポートします。
+ [Xorbits Inference](https://github.com/xorbitsai/inference):
強力で包括的な分散推論フレームワークであり、ワンクリックで独自のモデルや最新のオープンソースモデルを簡単にデプロイできます。
+ [ComfyUI-CogVideoXWrapper](https://github.com/kijai/ComfyUI-CogVideoXWrapper)
ComfyUIフレームワークを使用して、CogVideoXをワークフローに統合します。
+ [VideoSys](https://github.com/NUS-HPC-AI-Lab/VideoSys): VideoSysは、使いやすく高性能なビデオ生成インフラを提供し、最新のモデルや技術を継続的に統合しています。
+ [AutoDLイメージ](https://www.codewithgpu.com/i/THUDM/CogVideo/CogVideoX-5b-demo): コミュニティメンバーが提供するHuggingface
Spaceイメージのワンクリックデプロイメント。
+ [インテリアデザイン微調整モデル](https://huggingface.co/collections/bertjiazheng/koolcogvideox-66e4762f53287b7f39f8f3ba):
は、CogVideoXを基盤にした微調整モデルで、インテリアデザイン専用に設計されています。
+ [xDiT](https://github.com/xdit-project/xDiT):
xDiTは、複数のGPUクラスター上でDiTsを並列推論するためのエンジンです。xDiTはリアルタイムの画像およびビデオ生成サービスをサポートしています。
+ [CogVideoX-Interpolation](https://github.com/feizc/CogvideX-Interpolation):
キーフレーム補間生成において、より大きな柔軟性を提供することを目的とした、CogVideoX構造を基にした修正版のパイプライン。
+ [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): DiffSynth
Studioは、拡散エンジンです。テキストエンコーダー、UNet、VAEなどを含むアーキテクチャを再構築し、オープンソースコミュニティモデルとの互換性を維持しつつ、計算性能を向上させました。このフレームワークはCogVideoXに適応しています。
## プロジェクト構造
このオープンソースリポジトリは、**CogVideoX** オープンソースモデルの基本的な使用方法と微調整の例を迅速に開始するためのガイドです。
### Colabでのクイックスタート
無料のColab T4上で直接実行できる3つのプロジェクトを提供しています。
+ [CogVideoX-5B-T2V-Colab.ipynb](https://colab.research.google.com/drive/1pCe5s0bC_xuXbBlpvIH1z0kfdTLQPzCS?usp=sharing):
CogVideoX-5B テキストからビデオへの生成用Colabコード。
+ [CogVideoX-5B-T2V-Int8-Colab.ipynb](https://colab.research.google.com/drive/1DUffhcjrU-uz7_cpuJO3E_D4BaJT7OPa?usp=sharing):
CogVideoX-5B テキストからビデオへの量子化推論用Colabコード。1回の実行に約30分かかります。
+ [CogVideoX-5B-I2V-Colab.ipynb](https://colab.research.google.com/drive/17CqYCqSwz39nZAX2YyonDxosVKUZGzcX?usp=sharing):
CogVideoX-5B 画像からビデオへの生成用Colabコード。
+ [CogVideoX-5B-V2V-Colab.ipynb](https://colab.research.google.com/drive/1comfGAUJnChl5NwPuO8Ox5_6WCy4kbNN?usp=sharing):
CogVideoX-5B ビデオからビデオへの生成用Colabコード。
### Inference
+ [cli_demo](inference/cli_demo.py): 推論コードの詳細な説明が含まれており、一般的なパラメータの意味についても言及しています。
+ [cli_demo_quantization](inference/cli_demo_quantization.py):
量子化モデル推論コードで、低メモリのデバイスでも実行可能です。また、このコードを変更して、FP8 精度の CogVideoX
モデルの実行をサポートすることもできます。
+ [diffusers_vae_demo](inference/cli_vae_demo.py): VAE推論コードの実行には現在71GBのメモリが必要ですが、将来的には最適化される予定です。
+ [space demo](inference/gradio_composite_demo): Huggingface Spaceと同じGUIコードで、フレーム補間や超解像ツールが組み込まれています。
<div style="text-align: center;">
<img src="resources/web_demo.png" style="width: 100%; height: auto;" />
</div>
+ [convert_demo](inference/convert_demo.py):
ユーザー入力をCogVideoXに適した形式に変換する方法。CogVideoXは長いキャプションでトレーニングされているため、入力テキストをLLMを使用してトレーニング分布と一致させる必要があります。デフォルトではGLM-4を使用しますが、GPT、Geminiなどの他のLLMに置き換えることもできます。
+ [gradio_web_demo](inference/gradio_web_demo.py): CogVideoX-2B / 5B モデルを使用して動画を生成する方法を示す、シンプルな
Gradio Web UI デモです。私たちの Huggingface Space と同様に、このスクリプトを使用して Web デモを起動することができます。
### finetune
+ [train_cogvideox_lora](finetune/README_ja.md): CogVideoX diffusers 微調整方法の詳細な説明が含まれています。このコードを使用して、自分のデータセットで
CogVideoX を微調整することができます。
### sat
+ [sat_demo](sat/README.md):
SATウェイトの推論コードと微調整コードが含まれています。CogVideoXモデル構造に基づいて改善することをお勧めします。革新的な研究者は、このコードを使用して迅速なスタッキングと開発を行うことができます。
### ツール
このフォルダには、モデル変換/キャプション生成などのツールが含まれています。
+ [convert_weight_sat2hf](tools/convert_weight_sat2hf.py): SAT モデルの重みを Huggingface モデルの重みに変換します。
+ [caption_demo](tools/caption/README_ja.md): Caption ツール、ビデオを理解してテキストで出力するモデル。
+ [export_sat_lora_weight](tools/export_sat_lora_weight.py): SAT ファインチューニングモデルのエクスポートツール、SAT Lora
Adapter を diffusers 形式でエクスポートします。
+ [load_cogvideox_lora](tools/load_cogvideox_lora.py): diffusers 版のファインチューニングされた Lora Adapter
をロードするためのツールコード。
+ [llm_flux_cogvideox](tools/llm_flux_cogvideox/llm_flux_cogvideox.py): オープンソースのローカル大規模言語モデル +
Flux + CogVideoX を使用して自動的に動画を生成します。
+ [parallel_inference_xdit](tools/parallel_inference/parallel_inference_xdit.py):
[xDiT](https://github.com/xdit-project/xDiT)
によってサポートされ、ビデオ生成プロセスを複数の GPU で並列化します。
+ [cogvideox-factory](https://github.com/a-r-r-o-w/cogvideox-factory): CogVideoXの低コスト微調整フレームワークで、
`diffusers`バージョンのモデルに適応しています。より多くの解像度に対応し、単一の4090 GPUでCogVideoX-5Bの微調整が可能です。
## CogVideo(ICLR'23)
論文の公式リポジトリ: [CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers](https://arxiv.org/abs/2205.15868)
は [CogVideo branch](https://github.com/THUDM/CogVideo/tree/CogVideo) にあります。
**CogVideoは比較的高フレームレートのビデオを生成することができます。**
32フレームの4秒間のクリップが以下に示されています。


<div align="center">
<video src="https://github.com/user-attachments/assets/2fa19651-e925-4a2a-b8d6-b3f216d490ba" width="80%" controls autoplay></video>
</div>
CogVideoのデモは [https://models.aminer.cn/cogvideo](https://models.aminer.cn/cogvideo/) で体験できます。
*元の入力は中国語です。*
## 引用
🌟 私たちの仕事が役立つと思われた場合、ぜひスターを付けていただき、論文を引用してください。
```
@article{yang2024cogvideox,
title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others},
journal={arXiv preprint arXiv:2408.06072},
year={2024}
}
@article{hong2022cogvideo,
title={CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers},
author={Hong, Wenyi and Ding, Ming and Zheng, Wendi and Liu, Xinghan and Tang, Jie},
journal={arXiv preprint arXiv:2205.15868},
year={2022}
}
```
あなたの貢献をお待ちしています!詳細は[こちら](resources/contribute_ja.md)をクリックしてください。
## ライセンス契約
このリポジトリのコードは [Apache 2.0 License](LICENSE) の下で公開されています。
CogVideoX-2B モデル (対応するTransformersモジュールやVAEモジュールを含む) は
[Apache 2.0 License](LICENSE) の下で公開されています。
CogVideoX-5B モデル(Transformers モジュール、画像生成ビデオとテキスト生成ビデオのバージョンを含む) は
[CogVideoX LICENSE](https://huggingface.co/THUDM/CogVideoX-5b/blob/main/LICENSE) の下で公開されています。
================================================
FILE: CogVideo/README_zh.md
================================================
# CogVideo & CogVideoX
[Read this in English](./README.md)
[日本語で読む](./README_ja.md)
<div align="center">
<img src=resources/logo.svg width="50%"/>
</div>
<p align="center">
在 <a href="https://huggingface.co/spaces/THUDM/CogVideoX-5B" target="_blank"> 🤗 Huggingface Space</a> 或 <a href="https://modelscope.cn/studios/ZhipuAI/CogVideoX-5b-demo" target="_blank"> 🤖 ModelScope Space</a> 在线体验 CogVideoX-5B 模型
</p>
<p align="center">
📚 查看 <a href="https://arxiv.org/abs/2408.06072" target="_blank">论文</a> 和 <a href="https://zhipu-ai.feishu.cn/wiki/DHCjw1TrJiTyeukfc9RceoSRnCh" target="_blank">使用文档</a>
</p>
<p align="center">
👋 加入我们的 <a href="resources/WECHAT.md" target="_blank">微信</a> 和 <a href="https://discord.gg/dCGfUsagrD" target="_blank">Discord</a>
</p>
<p align="center">
📍 前往<a href="https://chatglm.cn/video?fr=osm_cogvideox"> 清影</a> 和 <a href="https://open.bigmodel.cn/?utm_campaign=open&_channel_track_key=OWTVNma9"> API平台</a> 体验更大规模的商业版视频生成模型。
</p>
## 项目更新
- 🔥🔥 **News**: ```2024/10/13```: 成本更低,单卡4090可微调`CogVideoX-5B`
的微调框架[cogvideox-factory](https://github.com/a-r-r-o-w/cogvideox-factory)已经推出,多种分辨率微调,欢迎使用。
- 🔥 **News**: ```2024/10/10```: 我们更新了我们的技术报告,请点击 [这里](https://arxiv.org/pdf/2408.06072)
查看,附上了更多的训练细节和demo,关于demo,点击[这里](https://yzy-thu.github.io/CogVideoX-demo/) 查看。
- 🔥 **News**: ```2024/10/09```: 我们在飞书[技术文档](https://zhipu-ai.feishu.cn/wiki/DHCjw1TrJiTyeukfc9RceoSRnCh)
公开CogVideoX微调指导,以进一步增加分发自由度,公开文档中所有示例可以完全复现
- 🔥 **News**: ```2024/9/19```: 我们开源 CogVideoX 系列图生视频模型 **CogVideoX-5B-I2V**
。该模型可以将一张图像作为背景输入,结合提示词一起生成视频,具有更强的可控性。
至此,CogVideoX系列模型已经支持文本生成视频,视频续写,图片生成视频三种任务。欢迎前往在线[体验](https://huggingface.co/spaces/THUDM/CogVideoX-5B-Space)。
- 🔥 **News**: ```2024/9/19```: CogVideoX 训练过程中用于将视频数据转换为文本描述的 Caption
模型 [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption)
已经开源。欢迎前往下载并使用。
- 🔥 ```2024/8/27```: 我们开源 CogVideoX 系列更大的模型 **CogVideoX-5B**
。我们大幅度优化了模型的推理性能,推理门槛大幅降低,您可以在 `GTX 1080TI` 等早期显卡运行 **CogVideoX-2B**,在 `RTX 3060`
等桌面端甜品卡运行 **CogVideoX-5B** 模型。 请严格按照[要求](requirements.txt)
更新安装依赖,推理代码请查看 [cli_demo](inference/cli_demo.py)。同时,**CogVideoX-2B** 模型开源协议已经修改为**Apache 2.0
协议**。
- 🔥 ```2024/8/6```: 我们开源 **3D Causal VAE**,用于 **CogVideoX-2B**,可以几乎无损地重构视频。
- 🔥 ```2024/8/6```: 我们开源 CogVideoX 系列视频生成模型的第一个模型, **CogVideoX-2B**。
- 🌱 **Source**: ```2022/5/19```: 我们开源了 CogVideo 视频生成模型(现在你可以在 `CogVideo` 分支中看到),这是首个开源的基于
Transformer 的大型文本生成视频模型,您可以访问 [ICLR'23 论文](https://arxiv.org/abs/2205.15868) 查看技术细节。
## 目录
跳转到指定部分:
- [快速开始](#快速开始)
- [SAT](#sat)
- [Diffusers](#Diffusers)
- [CogVideoX-2B 视频作品](#cogvideox-2b-视频作品)
- [CogVideoX模型介绍](#模型介绍)
- [完整项目代码结构](#完整项目代码结构)
- [Inference](#inference)
- [SAT](#sat)
- [Tools](#tools)
- [开源项目规划](#开源项目规划)
- [模型协议](#模型协议)
- [CogVideo(ICLR'23)模型介绍](#cogvideoiclr23)
- [引用](#引用)
## 快速开始
### 提示词优化
在开始运行模型之前,请参考 [这里](inference/convert_demo.py) 查看我们是怎么使用GLM-4(或者同级别的其他产品,例如GPT-4)
大模型对模型进行优化的,这很重要,
由于模型是在长提示词下训练的,一个好的提示词直接影响了视频生成的质量。
### SAT
查看sat文件夹下的 [sat_demo](sat/README.md):包含了 SAT 权重的推理代码和微调代码,推荐基于此代码进行 CogVideoX
模型结构的改进,研究者使用该代码可以更好的进行快速的迭代和开发。
### Diffusers
```
pip install -r requirements.txt
```
查看[diffusers_demo](inference/cli_demo.py):包含对推理代码更详细的解释,包括各种关键的参数。
欲了解更多关于量化推理的细节,请参考 [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao/)。使用 Diffusers
和 TorchAO,量化推理也是可能的,这可以实现内存高效的推理,并且在某些情况下编译后速度有所提升。有关在 A100 和 H100
上使用各种设置的内存和时间基准测试的完整列表,已发布在 [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao)
上。
## 视频作品
### CogVideoX-5B
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
<tr>
<td>
<video src="https://github.com/user-attachments/assets/cf5953ea-96d3-48fd-9907-c4708752c714" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/fe0a78e6-b669-4800-8cf0-b5f9b5145b52" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/c182f606-8f8c-421d-b414-8487070fcfcb" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/7db2bbce-194d-434d-a605-350254b6c298" width="100%" controls autoplay loop></video>
</td>
</tr>
<tr>
<td>
<video src="https://github.com/user-attachments/assets/62b01046-8cab-44cc-bd45-4d965bb615ec" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/d78e552a-4b3f-4b81-ac3f-3898079554f6" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/30894f12-c741-44a2-9e6e-ddcacc231e5b" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/926575ca-7150-435b-a0ff-4900a963297b" width="100%" controls autoplay loop></video>
</td>
</tr>
</table>
### CogVideoX-2B
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
<tr>
<td>
<video src="https://github.com/user-attachments/assets/ea3af39a-3160-4999-90ec-2f7863c5b0e9" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/9de41efd-d4d1-4095-aeda-246dd834e91d" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/941d6661-6a8d-4a1b-b912-59606f0b2841" width="100%" controls autoplay loop></video>
</td>
<td>
<video src="https://github.com/user-attachments/assets/938529c4-91ae-4f60-b96b-3c3947fa63cb" width="100%" controls autoplay loop></video>
</td>
</tr>
</table>
查看画廊的对应提示词,请点击[这里](resources/galary_prompt.md)
## 模型介绍
CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源版本视频生成模型。
下表展示我们提供的视频生成模型相关基础信息:
<table style="border-collapse: collapse; width: 100%;">
<tr>
<th style="text-align: center;">模型名</th>
<th style="text-align: center;">CogVideoX-2B</th>
<th style="text-align: center;">CogVideoX-5B</th>
<th style="text-align: center;">CogVideoX-5B-I2V </th>
</tr>
<tr>
<td style="text-align: center;">推理精度</td>
<td style="text-align: center;"><b>FP16*(推荐)</b>, BF16, FP32,FP8*,INT8,不支持INT4</td>
<td colspan="2" style="text-align: center;"><b>BF16(推荐)</b>, FP16, FP32,FP8*,INT8,不支持INT4</td>
</tr>
<tr>
<td style="text-align: center;">单GPU显存消耗<br></td>
<td style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> FP16: 18GB <br><b>diffusers FP16: 4GB起* </b><br><b>diffusers INT8(torchao): 3.6G起*</b></td>
<td colspan="2" style="text-align: center;"><a href="https://github.com/THUDM/SwissArmyTransformer">SAT</a> BF16: 26GB <br><b>diffusers BF16 : 5GB起* </b><br><b>diffusers INT8(torchao): 4.4G起* </b></td>
</tr>
<tr>
<td style="text-align: center;">多GPU推理显存消耗</td>
<td style="text-align: center;"><b>FP16: 10GB* using diffusers</b><br></td>
<td colspan="2" style="text-align: center;"><b>BF16: 15GB* using diffusers</b><br></td>
</tr>
<tr>
<td style="text-align: center;">推理速度<br>(Step = 50, FP/BF16)</td>
<td style="text-align: center;">单卡A100: ~90秒<br>单卡H100: ~45秒</td>
<td colspan="2" style="text-align: center;">单卡A100: ~180秒<br>单卡H100: ~90秒</td>
</tr>
<tr>
<td style="text-align: center;">微调精度</td>
<td style="text-align: center;"><b>FP16</b></td>
<td colspan="2" style="text-align: center;"><b>BF16</b></td>
</tr>
<tr>
<td style="text-align: center;">微调显存消耗</td>
<td style="text-align: center;">47 GB (bs=1, LORA)<br> 61 GB (bs=2, LORA)<br> 62GB (bs=1, SFT)</td>
<td style="text-align: center;">63 GB (bs=1, LORA)<br> 80 GB (bs=2, LORA)<br> 75GB (bs=1, SFT)<br></td>
<td style="text-align: center;">78 GB (bs=1, LORA)<br> 75GB (bs=1, SFT, 16GPU)<br></td>
</tr>
<tr>
<td style="text-align: center;">提示词语言</td>
<td colspan="3" style="text-align: center;">English*</td>
</tr>
<tr>
<td style="text-align: center;">提示词长度上限</td>
<td colspan="3" style="text-align: center;">226 Tokens</td>
</tr>
<tr>
<td style="text-align: center;">视频长度</td>
<td colspan="3" style="text-align: center;">6 秒</td>
</tr>
<tr>
<td style="text-align: center;">帧率</td>
<td colspan="3" style="text-align: center;">8 帧 / 秒 </td>
</tr>
<tr>
<td style="text-align: center;">视频分辨率</td>
<td colspan="3" style="text-align: center;">720 * 480,不支持其他分辨率(含微调)</td>
</tr>
<tr>
<td style="text-align: center;">位置编码</td>
<td style="text-align: center;">3d_sincos_pos_embed</td>
<td style="text-align: center;">3d_sincos_pos_embed</td>
<td style="text-align: center;">3d_rope_pos_embed + learnable_pos_embed</td>
</tr>
<tr>
<td style="text-align: center;">下载链接 (Diffusers)</td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-2b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-2b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-2b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b">🟣 WiseModel</a></td>
<td style="text-align: center;"><a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V">🤗 HuggingFace</a><br><a href="https://modelscope.cn/models/ZhipuAI/CogVideoX-5b-I2V">🤖 ModelScope</a><br><a href="https://wisemodel.cn/models/ZhipuAI/CogVideoX-5b-I2V">🟣 WiseModel</a></td>
</tr>
<tr>
<td style="text-align: center;">下载链接 (SAT)</td>
<td colspan="3" style="text-align: center;"><a href="./sat/README_zh.md">SAT</a></td>
</tr>
</table>
**数据解释**
+ 使用 diffusers 库进行测试时,启用了全部`diffusers`库自带的优化,该方案未测试在非**NVIDIA A100 / H100**
外的设备上的实际显存 / 内存占用。通常,该方案可以适配于所有 **NVIDIA 安培架构**
以上的设备。若关闭优化,显存占用会成倍增加,峰值显存约为表格的3倍。但速度提升3-4倍左右。你可以选择性的关闭部分优化,这些优化包括:
```
pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()
```
+ 多GPU推理时,需要关闭 `enable_sequential_cpu_offload()` 优化。
+ 使用 INT8 模型会导致推理速度降低,此举是为了满足显存较低的显卡能正常推理并保持较少的视频质量损失,推理速度大幅降低。
+ CogVideoX-2B 模型采用 `FP16` 精度训练, 所有 CogVideoX-5B 模型采用 `BF16` 精度训练。我们推荐使用模型训练的精度进行推理。
+ [PytorchAO](https://github.com/pytorch/ao) 和 [Optimum-quanto](https://github.com/huggingface/optimum-quanto/)
可以用于量化文本编码器、Transformer 和 VAE 模块,以降低 CogVideoX 的内存需求。这使得在免费的 T4 Colab 或更小显存的 GPU
上运行模型成为可能!同样值得注意的是,TorchAO 量化完全兼容 `torch.compile`,这可以显著提高推理速度。在 `NVIDIA H100`
及以上设备上必须使用 `FP8` 精度,这需要源码安装 `torch`、`torchao`、`diffusers` 和 `accelerate` Python
包。建议使用 `CUDA 12.4`。
+ 推理速度测试同样采用了上述显存优化方案,不采用显存优化的情况下,推理速度提升约10%。 只有`diffusers`版本模型支持量化。
+ 模型仅支持英语输入,其他语言可以通过大模型润色时翻译为英语。
+ 模型微调所占用的显存是在 `8 * H100` 环境下进行测试,程序已经自动使用`Zero 2` 优化。表格中若有标注具体GPU数量则必须使用大于等于该数量的GPU进行微调。
## 友情链接
我们非常欢迎来自社区的贡献,并积极的贡献开源社区。以下作品已经对CogVideoX进行了适配,欢迎大家使用:
+ [CogVideoX-Fun](https://github.com/aigc-apps/CogVideoX-Fun):
CogVideoX-Fun是一个基于CogVideoX结构修改后的pipeline,支持自由的分辨率,多种启动方式。
+ [CogStudio](https://github.com/pinokiofactory/cogstudio): CogVideo 的 Gradio Web UI单独实现仓库,支持更多功能的 Web UI。
+ [Xorbits Inference](https://github.com/xorbitsai/inference): 性能强大且功能全面的分布式推理框架,轻松一键部署你自己的模型或内置的前沿开源模型。
+ [ComfyUI-CogVideoXWrapper](https://github.com/kijai/ComfyUI-CogVideoXWrapper) 使用ComfyUI框架,将CogVideoX加入到你的工作流中。
+ [VideoSys](https://github.com/NUS-HPC-AI-Lab/VideoSys): VideoSys 提供了易用且高性能的视频生成基础设施,支持完整的管道,并持续集成最新的模型和技术。
+ [AutoDL镜像](https://www.codewithgpu.com/i/THUDM/CogVideo/CogVideoX-5b-demo): 由社区成员提供的一键部署Huggingface
Space镜像。
+ [室内设计微调模型](https://huggingface.co/collections/bertjiazheng/koolcogvideox-66e4762f53287b7f39f8f3ba) 基于
CogVideoX的微调模型,它专为室内设计而设计
+ [xDiT](https://github.com/xdit-project/xDiT): xDiT是一个用于在多GPU集群上对DiTs并行推理的引擎。xDiT支持实时图像和视频生成服务。
+ [CogVideoX-Interpolation](https://github.com/feizc/CogvideX-Interpolation): 基于 CogVideoX 结构修改的管道,旨在为关键帧插值生成提供更大的灵活性。
+ [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): DiffSynth 工作室是一款扩散引擎。重构了架构,包括文本编码器、UNet、VAE
等,在保持与开源社区模型兼容性的同时,提升了计算性能。该框架已经适配 CogVideoX。
## 完整项目代码结构
本开源仓库将带领开发者快速上手 **CogVideoX** 开源模型的基础调用方式、微调示例。
### Colab 快速使用
这里提供了三个能直接在免费的 Colab T4上 运行的项目
+ [CogVideoX-5B-T2V-Colab.ipynb](https://colab.research.google.com/drive/1pCe5s0bC_xuXbBlpvIH1z0kfdTLQPzCS?usp=sharing):
CogVideoX-5B 文字生成视频 Colab 代码。
+ [CogVideoX-5B-T2V-Int8-Colab.ipynb](https://colab.research.google.com/drive/1DUffhcjrU-uz7_cpuJO3E_D4BaJT7OPa?usp=sharing):
CogVideoX-5B 文字生成视频量化推理 Colab 代码,运行一次大约需要30分钟。
+ [CogVideoX-5B-I2V-Colab.ipynb](https://colab.research.google.com/drive/17CqYCqSwz39nZAX2YyonDxosVKUZGzcX?usp=sharing):
CogVideoX-5B 图片生成视频 Colab 代码。
+ [CogVideoX-5B-V2V-Colab.ipynb](https://colab.research.google.com/drive/1comfGAUJnChl5NwPuO8Ox5_6WCy4kbNN?usp=sharing):
CogVideoX-5B 视频生成视频 Colab 代码。
### inference
+ [cli_demo](inference/cli_demo.py): 更详细的推理代码讲解,常见参数的意义,在这里都会提及。
+ [cli_demo_quantization](inference/cli_demo_quantization.py):
量化模型推理代码,可以在显存较低的设备上运行,也可以基于此代码修改,以支持运行FP8等精度的CogVideoX模型。请注意,FP8
仅测试通过,且必须将 `torch-nightly`,`torchao`源代码安装,不建议在生产环境中使用。
+ [diffusers_vae_demo](inference/cli_vae_demo.py): 单独执行VAE的推理代码。
+ [space demo](inference/gradio_composite_demo): Huggingface Space同款的 GUI 代码,植入了插帧,超分工具。
<div style="text-align: center;">
<img src="resources/web_demo.png" style="width: 100%; height: auto;" />
</div>
+ [convert_demo](inference/convert_demo.py): 如何将用户的输入转换成适合
CogVideoX的长输入。因为CogVideoX是在长文本上训练的,所以我们需要把输入文本的分布通过LLM转换为和训练一致的长文本。脚本中默认使用GLM-4,也可以替换为GPT、Gemini等任意大语言模型。
+ [gradio_web_demo](inference/gradio_composite_demo/app.py): 与 Huggingface Space 完全相同的代码实现,快速部署 CogVideoX
GUI体验。
### finetune
+ [train_cogvideox_lora](finetune/README_zh.md): diffusers版本 CogVideoX 模型微调方案和细节。
### sat
+ [sat_demo](sat/README_zh.md): 包含了 SAT 权重的推理代码和微调代码,推荐基于 CogVideoX
模型结构进行改进,创新的研究者使用该代码以更好的进行快速的堆叠和开发。
### tools
本文件夹包含了一些工具,用于模型的转换 / Caption 等工作。
+ [convert_weight_sat2hf](tools/convert_weight_sat2hf.py): 将 SAT 模型权重转换为 Huggingface 模型权重。
+ [caption_demo](tools/caption/README_zh.md): Caption 工具,对视频理解并用文字输出的模型。
+ [export_sat_lora_weight](tools/export_sat_lora_weight.py): SAT微调模型导出工具,将
SAT Lora Adapter 导出为 diffusers 格式。
+ [load_cogvideox_lora](tools/load_cogvideox_lora.py): 载入diffusers版微调Lora Adapter的工具代码。
+ [llm_flux_cogvideox](tools/llm_flux_cogvideox/llm_flux_cogvideox.py): 使用开源本地大语言模型 + Flux +
CogVideoX实现自动化生成视频。
+ [parallel_inference_xdit](tools/parallel_inference/parallel_inference_xdit.py):
在多个 GPU 上并行化视频生成过程,
由[xDiT](https://github.com/xdit-project/xDiT)提供支持。
+ [cogvideox-factory](https://github.com/a-r-r-o-w/cogvideox-factory): CogVideoX低成本微调框架,适配`diffusers`
版本模型。支持更多分辨率,单卡4090即可微调 CogVideoX-5B 。
## CogVideo(ICLR'23)
[CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers](https://arxiv.org/abs/2205.15868)
的官方repo位于[CogVideo branch](https://github.com/THUDM/CogVideo/tree/CogVideo)。
**CogVideo可以生成高帧率视频,下面展示了一个32帧的4秒视频。**


<div align="center">
<video src="https://github.com/user-attachments/assets/ea3af39a-3160-4999-90ec-2f7863c5b0e9" width="80%" controls autoplay></video>
</div>
CogVideo的demo网站在[https://models.aminer.cn/cogvideo](https://models.aminer.cn/cogvideo/)。您可以在这里体验文本到视频生成。
*原始输入为中文。*
## 引用
🌟 如果您发现我们的工作有所帮助,欢迎引用我们的文章,留下宝贵的stars
```
@article{yang2024cogvideox,
title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others},
journal={arXiv preprint arXiv:2408.06072},
year={2024}
}
@article{hong2022cogvideo,
title={CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers},
author={Hong, Wenyi and Ding, Ming and Zheng, Wendi and Liu, Xinghan and Tang, Jie},
journal={arXiv preprint arXiv:2205.15868},
year={2022}
}
```
我们欢迎您的贡献,您可以点击[这里](resources/contribute_zh.md)查看更多信息。
## 模型协议
本仓库代码使用 [Apache 2.0 协议](LICENSE) 发布。
CogVideoX-2B 模型 (包括其对应的Transformers模块,VAE模块) 根据 [Apache 2.0 协议](LICENSE) 许可证发布。
CogVideoX-5B 模型 (Transformers 模块,包括图生视频,文生视频版本)
根据 [CogVideoX LICENSE](https://huggingface.co/THUDM/CogVideoX-5b/blob/main/LICENSE)
许可证发布。
================================================
FILE: CogVideo/download.sh
================================================
#!/usr/bin/env bash
# Download and unpack the CogVideoX-2B SAT weights (VAE + transformer)
# from Tsinghua Cloud into ./CogVideoX-2b-sat.
#
# Fixes over the original:
# - `set -euo pipefail` aborts on any failed download/unzip instead of
#   silently continuing with a partial or missing archive.
# - `mkdir -p` makes the script re-runnable (plain `mkdir` fails if the
#   directory already exists).
# - `wget -O <name>` saves each archive directly under its final name,
#   instead of relying on wget's derived filename 'index.html?dl=1' and
#   renaming it afterwards, which is fragile and breaks on re-runs.
set -euo pipefail

mkdir -p CogVideoX-2b-sat
cd CogVideoX-2b-sat

# VAE weights
wget -O vae.zip 'https://cloud.tsinghua.edu.cn/f/fdba7608a49c463ba754/?dl=1'
unzip -o vae.zip

# Transformer weights
wget -O transformer.zip 'https://cloud.tsinghua.edu.cn/f/556a3e1329e74f1bac45/?dl=1'
unzip -o transformer.zip
================================================
FILE: CogVideo/finetune/README.md
================================================
# CogVideoX diffusers Fine-tuning Guide
[中文阅读](./README_zh.md)
[日本語で読む](./README_ja.md)
This feature is not fully complete yet. If you want to check the fine-tuning for the SAT version, please
see [here](../sat/README.md). The dataset format is different from this version.
## Hardware Requirements
+ CogVideoX-2B / 5B LoRA: 1 * A100 (5B need to use `--use_8bit_adam`)
+ CogVideoX-2B SFT: 8 * A100 (Working)
+ CogVideoX-5B-I2V is not supported yet.
## Install Dependencies
Since the related code has not been merged into the diffusers release, you need to base your fine-tuning on the
diffusers branch. Please follow the steps below to install dependencies:
```shell
git clone https://github.com/huggingface/diffusers.git
cd diffusers # Now in Main branch
pip install -e .
```
## Prepare the Dataset
First, you need to prepare the dataset. The dataset format should be as follows, with `videos.txt` containing the list
of videos in the `videos` directory:
```
.
├── prompts.txt
├── videos
└── videos.txt
```
You can download
the [Disney Steamboat Willie](https://huggingface.co/datasets/Wild-Heart/Disney-VideoGeneration-Dataset) dataset from
here.
This video fine-tuning dataset is used as a test for fine-tuning.
## Configuration Files and Execution
The `accelerate` configuration files are as follows:
+ `accelerate_config_machine_multi.yaml`: Suitable for multi-GPU use
+ `accelerate_config_machine_single.yaml`: Suitable for single-GPU use
The configuration for the `finetune` script is as follows:
```
accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \ # Use accelerate to launch multi-GPU training with the config file accelerate_config_machine_single.yaml
train_cogvideox_lora.py \ # Training script train_cogvideox_lora.py for LoRA fine-tuning on CogVideoX model
--gradient_checkpointing \ # Enable gradient checkpointing to reduce memory usage
--pretrained_model_name_or_path $MODEL_PATH \ # Path to the pretrained model, specified by $MODEL_PATH
--cache_dir $CACHE_PATH \ # Cache directory for model files, specified by $CACHE_PATH
--enable_tiling \ # Enable tiling technique to process videos in chunks, saving memory
--enable_slicing \ # Enable slicing to further optimize memory by slicing inputs
--instance_data_root $DATASET_PATH \ # Dataset path specified by $DATASET_PATH
--caption_column prompts.txt \ # Specify the file prompts.txt for video descriptions used in training
--video_column videos.txt \ # Specify the file videos.txt for video paths used in training
--validation_prompt "" \ # Prompt used for generating validation videos during training
--validation_prompt_separator ::: \ # Set ::: as the separator for validation prompts
--num_validation_videos 1 \ # Generate 1 validation video per validation round
--validation_epochs 100 \ # Perform validation every 100 training epochs
--seed 42 \ # Set random seed to 42 for reproducibility
--rank 128 \ # Set the rank for LoRA parameters to 128
--lora_alpha 64 \ # Set the alpha parameter for LoRA to 64, adjusting LoRA learning rate
--mixed_precision bf16 \ # Use bf16 mixed precision for training to save memory
--output_dir $OUTPUT_PATH \ # Specify the output directory for the model, defined by $OUTPUT_PATH
--height 480 \ # Set video height to 480 pixels
--width 720 \ # Set video width to 720 pixels
--fps 8 \ # Set video frame rate to 8 frames per second
--max_num_frames 49 \ # Set the maximum number of frames per video to 49
--skip_frames_start 0 \ # Skip 0 frames at the start of the video
--skip_frames_end 0 \ # Skip 0 frames at the end of the video
--train_batch_size 4 \ # Set training batch size to 4
--num_train_epochs 30 \ # Total number of training epochs set to 30
--checkpointing_steps 1000 \ # Save model checkpoint every 1000 steps
--gradient_accumulation_steps 1 \ # Accumulate gradients for 1 step, updating after each batch
--learning_rate 1e-3 \ # Set learning rate to 0.001
--lr_scheduler cosine_with_restarts \ # Use cosine learning rate scheduler with restarts
--lr_warmup_steps 200 \ # Warm up the learning rate for the first 200 steps
--lr_num_cycles 1 \ # Set the number of learning rate cycles to 1
--optimizer AdamW \ # Use the AdamW optimizer
--adam_beta1 0.9 \ # Set Adam optimizer beta1 parameter to 0.9
--adam_beta2 0.95 \ # Set Adam optimizer beta2 parameter to 0.95
--max_grad_norm 1.0 \ # Set maximum gradient clipping value to 1.0
--allow_tf32 \ # Enable TF32 to speed up training
--report_to wandb # Use Weights and Biases (wandb) for logging and monitoring the training
```
## Running the Script to Start Fine-tuning
Single Node (One GPU or Multi GPU) fine-tuning:
```shell
bash finetune_single_rank.sh
```
Multi-Node fine-tuning:
```shell
bash finetune_multi_rank.sh # Needs to be run on each node
```
## Loading the Fine-tuned Model
+ Please refer to [cli_demo.py](../inference/cli_demo.py) for how to load the fine-tuned model.
## Best Practices
+ Includes 70 training videos with a resolution of `200 x 480 x 720` (frames x height x width). By skipping frames in
the data preprocessing, we created two smaller datasets with 49 and 16 frames to speed up experimentation, as the
maximum frame limit recommended by the CogVideoX team is 49 frames. We split the 70 videos into three groups of 10,
25, and 50 videos, with similar conceptual nature.
+ Using 25 or more videos works best when training new concepts and styles.
+ It works better to train using identifier tokens specified with `--id_token`. This is similar to Dreambooth training,
but regular fine-tuning without such tokens also works.
+ The original repository used `lora_alpha` set to 1. We found this value ineffective across multiple runs, likely due
to differences in the backend and training setup. Our recommendation is to set `lora_alpha` equal to rank or rank //
2.
+ We recommend using a rank of 64 or higher.
================================================
FILE: CogVideo/finetune/README_ja.md
================================================
# CogVideoX diffusers 微調整方法
[Read this in English.](./README.md)
[中文阅读](./README_zh.md)
この機能はまだ完全に完成していません。SATバージョンの微調整を確認したい場合は、[こちら](../sat/README_ja.md)を参照してください。本バージョンとは異なるデータセット形式を使用しています。
## ハードウェア要件
+ CogVideoX-2B / 5B T2V LORA: 1 * A100 (5B need to use `--use_8bit_adam`)
+ CogVideoX-2B SFT: 8 * A100 (動作確認済み)
+ CogVideoX-5B-I2V まだサポートしていません
## 依存関係のインストール
関連コードはまだdiffusersのリリース版に統合されていないため、diffusersブランチを使用して微調整を行う必要があります。以下の手順に従って依存関係をインストールしてください:
```shell
git clone https://github.com/huggingface/diffusers.git
cd diffusers # Now in Main branch
pip install -e .
```
## データセットの準備
まず、データセットを準備する必要があります。データセットの形式は以下のようになります。
```
.
├── prompts.txt
├── videos
└── videos.txt
```
[ディズニースチームボートウィリー](https://huggingface.co/datasets/Wild-Heart/Disney-VideoGeneration-Dataset)をここからダウンロードできます。
ビデオ微調整データセットはテスト用として使用されます。
## 設定ファイルと実行
`accelerate` 設定ファイルは以下の通りです:
+ accelerate_config_machine_multi.yaml 複数GPU向け
+ accelerate_config_machine_single.yaml 単一GPU向け
`finetune` スクリプト設定ファイルの例:
```
accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \ # accelerateを使用してmulti-GPUトレーニングを起動、設定ファイルはaccelerate_config_machine_single.yaml
train_cogvideox_lora.py \ # LoRAの微調整用のトレーニングスクリプトtrain_cogvideox_lora.pyを実行
--gradient_checkpointing \ # メモリ使用量を減らすためにgradient checkpointingを有効化
--pretrained_model_name_or_path $MODEL_PATH \ # 事前学習済みモデルのパスを$MODEL_PATHで指定
--cache_dir $CACHE_PATH \ # モデルファイルのキャッシュディレクトリを$CACHE_PATHで指定
--enable_tiling \ # メモリ節約のためにタイル処理を有効化し、動画をチャンク分けして処理
--enable_slicing \ # 入力をスライスしてさらにメモリ最適化
--instance_data_root $DATASET_PATH \ # データセットのパスを$DATASET_PATHで指定
--caption_column prompts.txt \ # トレーニングで使用する動画の説明ファイルをprompts.txtで指定
--video_column videos.txt \ # トレーニングで使用する動画のパスファイルをvideos.txtで指定
--validation_prompt "" \ # トレーニング中に検証用の動画を生成する際のプロンプト
--validation_prompt_separator ::: \ # 検証プロンプトの区切り文字を:::に設定
--num_validation_videos 1 \ # 各検証ラウンドで1本の動画を生成
--validation_epochs 100 \ # 100エポックごとに検証を実施
--seed 42 \ # 再現性を保証するためにランダムシードを42に設定
--rank 128 \ # LoRAのパラメータのランクを128に設定
--lora_alpha 64 \ # LoRAのalphaパラメータを64に設定し、LoRAの学習率を調整
--mixed_precision bf16 \ # bf16混合精度でトレーニングし、メモリを節約
--output_dir $OUTPUT_PATH \ # モデルの出力ディレクトリを$OUTPUT_PATHで指定
--height 480 \ # 動画の高さを480ピクセルに設定
--width 720 \ # 動画の幅を720ピクセルに設定
--fps 8 \ # 動画のフレームレートを1秒あたり8フレームに設定
--max_num_frames 49 \ # 各動画の最大フレーム数を49に設定
--skip_frames_start 0 \ # 動画の最初のフレームを0スキップ
--skip_frames_end 0 \ # 動画の最後のフレームを0スキップ
--train_batch_size 4 \ # トレーニングのバッチサイズを4に設定
--num_train_epochs 30 \ # 総トレーニングエポック数を30に設定
--checkpointing_steps 1000 \ # 1000ステップごとにモデルのチェックポイントを保存
--gradient_accumulation_steps 1 \ # 1ステップの勾配累積を行い、各バッチ後に更新
--learning_rate 1e-3 \ # 学習率を0.001に設定
--lr_scheduler cosine_with_restarts \ # リスタート付きのコサイン学習率スケジューラを使用
--lr_warmup_steps 200 \ # トレーニングの最初の200ステップで学習率をウォームアップ
--lr_num_cycles 1 \ # 学習率のサイクル数を1に設定
--optimizer AdamW \ # AdamWオプティマイザーを使用
--adam_beta1 0.9 \ # Adamオプティマイザーのbeta1パラメータを0.9に設定
--adam_beta2 0.95 \ # Adamオプティマイザーのbeta2パラメータを0.95に設定
--max_grad_norm 1.0 \ # 勾配クリッピングの最大値を1.0に設定
--allow_tf32 \ # トレーニングを高速化するためにTF32を有効化
--report_to wandb # Weights and Biasesを使用してトレーニングの記録とモニタリングを行う
```
## 微調整を開始
単一マシン (シングルGPU、マルチGPU) での微調整:
```shell
bash finetune_single_rank.sh
```
複数マシン・マルチGPUでの微調整:
```shell
bash finetune_multi_rank.sh # 各ノードで実行する必要があります。
```
## 微調整済みモデルのロード
+ 微調整済みのモデルをロードする方法については、[cli_demo.py](../inference/cli_demo.py) を参照してください。
## ベストプラクティス
+ 解像度が `200 x 480 x 720`(フレーム数 x 高さ x 幅)のトレーニングビデオが70本含まれています。データ前処理でフレームをスキップすることで、49フレームと16フレームの小さなデータセットを作成しました。これは実験を加速するためのもので、CogVideoXチームが推奨する最大フレーム数制限は49フレームです。
+ 25本以上のビデオが新しい概念やスタイルのトレーニングに最適です。
+ 現在、`--id_token` を指定して識別トークンを使用してトレーニングする方が効果的です。これはDreamboothトレーニングに似ていますが、通常の微調整でも機能します。
+ 元のリポジトリでは `lora_alpha` を1に設定していましたが、複数の実行でこの値が効果的でないことがわかりました。モデルのバックエンドやトレーニング設定によるかもしれません。私たちの提案は、lora_alphaをrankと同じか、rank // 2に設定することです。
+ Rank 64以上の設定を推奨します。
================================================
FILE: CogVideo/finetune/README_zh.md
================================================
# CogVideoX diffusers 微调方案
[Read this in English](./README.md)
[日本語で読む](./README_ja.md)
本功能尚未完全完善,如果您想查看SAT版本微调,请查看[这里](../sat/README_zh.md)。其数据集格式与本版本不同。
## 硬件要求
+ CogVideoX-2B / 5B T2V LORA: 1 * A100 (5B need to use `--use_8bit_adam`)
+ CogVideoX-2B SFT: 8 * A100 (制作中)
+ CogVideoX-5B-I2V 暂未支持
## 安装依赖
由于相关代码还没有被合并到diffusers发行版,你需要基于diffusers分支进行微调。请按照以下步骤安装依赖:
```shell
git clone https://github.com/huggingface/diffusers.git
cd diffusers # Now in Main branch
pip install -e .
```
## 准备数据集
首先,你需要准备数据集,数据集格式如下,其中,videos.txt 存放 videos 中的视频。
```
.
├── prompts.txt
├── videos
└── videos.txt
```
你可以从这里下载 [迪士尼汽船威利号](https://huggingface.co/datasets/Wild-Heart/Disney-VideoGeneration-Dataset)
视频微调数据集作为测试微调。
## 配置文件和运行
`accelerate` 配置文件如下:
+ accelerate_config_machine_multi.yaml 适合多GPU使用
+ accelerate_config_machine_single.yaml 适合单GPU使用
`finetune` 脚本配置文件如下:
```shell
accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \ # 使用 accelerate 启动多GPU训练,配置文件为 accelerate_config_machine_single.yaml
train_cogvideox_lora.py \ # 运行的训练脚本为 train_cogvideox_lora.py,用于在 CogVideoX 模型上进行 LoRA 微调
--gradient_checkpointing \ # 启用梯度检查点功能,以减少显存使用
--pretrained_model_name_or_path $MODEL_PATH \ # 预训练模型路径,通过 $MODEL_PATH 指定
--cache_dir $CACHE_PATH \ # 模型缓存路径,由 $CACHE_PATH 指定
--enable_tiling \ # 启用tiling技术,以分片处理视频,节省显存
--enable_slicing \ # 启用slicing技术,将输入切片,以进一步优化内存
--instance_data_root $DATASET_PATH \ # 数据集路径,由 $DATASET_PATH 指定
--caption_column prompts.txt \ # 指定用于训练的视频描述文件,文件名为 prompts.txt
--video_column videos.txt \ # 指定用于训练的视频路径文件,文件名为 videos.txt
--validation_prompt "" \ # 验证集的提示语 (prompt),用于在训练期间生成验证视频
--validation_prompt_separator ::: \ # 设置验证提示语的分隔符为 :::
--num_validation_videos 1 \ # 每个验证回合生成 1 个视频
--validation_epochs 100 \ # 每 100 个训练epoch进行一次验证
--seed 42 \ # 设置随机种子为 42,以保证结果的可复现性
--rank 128 \ # 设置 LoRA 参数的秩 (rank) 为 128
--lora_alpha 64 \ # 设置 LoRA 的 alpha 参数为 64,用于调整LoRA的学习率
--mixed_precision bf16 \ # 使用 bf16 混合精度进行训练,减少显存使用
--output_dir $OUTPUT_PATH \ # 指定模型输出目录,由 $OUTPUT_PATH 定义
--height 480 \ # 视频高度为 480 像素
--width 720 \ # 视频宽度为 720 像素
--fps 8 \ # 视频帧率设置为 8 帧每秒
--max_num_frames 49 \ # 每个视频的最大帧数为 49 帧
--skip_frames_start 0 \ # 跳过视频开头的帧数为 0
--skip_frames_end 0 \ # 跳过视频结尾的帧数为 0
--train_batch_size 4 \ # 训练时的 batch size 设置为 4
--num_train_epochs 30 \ # 总训练epoch数为 30
--checkpointing_steps 1000 \ # 每 1000 步保存一次模型检查点
--gradient_accumulation_steps 1 \ # 梯度累计步数为 1,即每个 batch 后都会更新梯度
--learning_rate 1e-3 \ # 学习率设置为 0.001
--lr_scheduler cosine_with_restarts \ # 使用带重启的余弦学习率调度器
--lr_warmup_steps 200 \ # 在训练的前 200 步进行学习率预热
--lr_num_cycles 1 \ # 学习率周期设置为 1
--optimizer AdamW \ # 使用 AdamW 优化器
--adam_beta1 0.9 \ # 设置 Adam 优化器的 beta1 参数为 0.9
--adam_beta2 0.95 \ # 设置 Adam 优化器的 beta2 参数为 0.95
--max_grad_norm 1.0 \ # 最大梯度裁剪值设置为 1.0
--allow_tf32 \ # 启用 TF32 以加速训练
--report_to wandb # 使用 Weights and Biases 进行训练记录与监控
```
## 运行脚本,开始微调
单机(单卡,多卡)微调:
```shell
bash finetune_single_rank.sh
```
多机多卡微调:
```shell
bash finetune_multi_rank.sh #需要在每个节点运行
```
## 载入微调的模型
+ 请关注[cli_demo.py](../inference/cli_demo.py) 以了解如何加载微调的模型。
## 最佳实践
+ 包含70个分辨率为 `200 x 480 x 720`(帧数 x 高 x
宽)的训练视频。通过数据预处理中的帧跳过,我们创建了两个较小的49帧和16帧数据集,以加快实验速度,因为CogVideoX团队建议的最大帧数限制是49帧。我们将70个视频分成三组,分别为10、25和50个视频。这些视频的概念性质相似。
+ 25个及以上的视频在训练新概念和风格时效果最佳。
+ 现使用可以通过 `--id_token` 指定的标识符token进行训练效果更好。这类似于 Dreambooth 训练,但不使用这种token的常规微调也可以工作。
+ 原始仓库使用 `lora_alpha` 设置为 1。我们发现这个值在多次运行中效果不佳,可能是因为模型后端和训练设置的不同。我们的建议是将
lora_alpha 设置为与 rank 相同或 rank // 2。
+ 建议使用 rank 为 64 及以上的设置。
================================================
FILE: CogVideo/finetune/accelerate_config_machine_single.yaml
================================================
# Accelerate launch configuration: single machine, 8 processes, DeepSpeed ZeRO stage 2.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: none  # keep optimizer state on GPU (no CPU/NVMe offload)
  offload_param_device: none      # keep parameters on GPU
  zero3_init_flag: false
  zero_stage: 2                   # ZeRO-2: shard optimizer state and gradients across ranks
distributed_type: DEEPSPEED
downcast_bf16: 'no'
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
dynamo_backend: 'no'
mixed_precision: 'no'             # mixed precision is not set here; the training script passes its own flag
num_machines: 1
num_processes: 8                  # one process per GPU; change to match your GPU count
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
================================================
FILE: CogVideo/finetune/accelerate_config_machine_single_debug.yaml
================================================
# Debug variant of the single-machine Accelerate config: identical settings but
# only one process (one GPU), for quick local runs.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: none  # keep optimizer state on GPU
  offload_param_device: none      # keep parameters on GPU
  zero3_init_flag: false
  zero_stage: 2                   # ZeRO-2: shard optimizer state and gradients
distributed_type: DEEPSPEED
downcast_bf16: 'no'
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
dynamo_backend: 'no'
mixed_precision: 'no'             # precision is controlled by the training script's own flag
num_machines: 1
num_processes: 1                  # single process for debugging
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
================================================
FILE: CogVideo/finetune/finetune_single_rank_injector.sh
================================================
#!/bin/bash
# Fine-tune the 3DTrajMaster injector on top of CogVideoX-5B, starting from a
# pretrained LoRA checkpoint.
export MODEL_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b" # Change it to CogVideoX-5B path
export TRANSFORMER_PATH="" # Resume from pretrained injector checkpoint
export LORA_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/lora" # Change it to pretrained lora path
export CACHE_PATH="~/.cache"
export DATASET_PATH="/ytech_m2v2_hdd/fuxiao/360Motion-Dataset" # Change it to 360-Motion Dataset path
export OUTPUT_PATH="injector"
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# If you are not using 8 GPUs, change `num_processes` in
# `accelerate_config_machine_single.yaml` (the config launched below) to match
# your GPU count.
accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \
  train_cogvideox_injector.py \
  --gradient_checkpointing \
  --pretrained_model_name_or_path "$MODEL_PATH" \
  --lora_path "$LORA_PATH" \
  --cache_dir "$CACHE_PATH" \
  --enable_tiling \
  --enable_slicing \
  --finetune_init \
  --instance_data_root "$DATASET_PATH" \
  --validation_prompt "a woman with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes and a robotic gazelle with a sturdy aluminum frame, an agile build, articulated legs and curved, metallic horns are moving in the city" \
  --validation_prompt_separator ::: \
  --num_validation_videos 1 \
  --validation_epochs 1 \
  --block_interval 2 \
  --seed 42 \
  --lora_scale 1.0 \
  --mixed_precision bf16 \
  --output_dir "$OUTPUT_PATH" \
  --height 480 \
  --width 720 \
  --fps 8 \
  --max_num_frames 49 \
  --skip_frames_start 0 \
  --skip_frames_end 0 \
  --train_batch_size 1 \
  --num_train_epochs 1000 \
  --checkpointing_steps 4000 \
  --gradient_accumulation_steps 1 \
  --learning_rate 1e-4 \
  --lr_scheduler cosine_with_restarts \
  --lr_warmup_steps 200 \
  --lr_num_cycles 1 \
  --optimizer AdamW \
  --adam_beta1 0.9 \
  --adam_beta2 0.95 \
  --max_grad_norm 1.0 \
  --allow_tf32 \
  --report_to wandb
# To resume from a pretrained injector checkpoint, append:
#   --resume_from_checkpoint $TRANSFORMER_PATH
================================================
FILE: CogVideo/finetune/finetune_single_rank_lora.sh
================================================
#!/bin/bash
# LoRA fine-tuning of CogVideoX-5B on the 360-Motion dataset.
export MODEL_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b" # Change it to CogVideoX-5B path
export CACHE_PATH="~/.cache"
export DATASET_PATH="/ytech_m2v2_hdd/fuxiao/360Motion-Dataset" # Change it to 360-Motion Dataset path
export OUTPUT_PATH="lora"
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# If you are not using 8 GPUs, change `num_processes` in
# `accelerate_config_machine_single.yaml` (the config launched below) to match
# your GPU count.
accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \
  train_cogvideox_lora.py \
  --gradient_checkpointing \
  --pretrained_model_name_or_path "$MODEL_PATH" \
  --cache_dir "$CACHE_PATH" \
  --enable_tiling \
  --enable_slicing \
  --instance_data_root "$DATASET_PATH" \
  --validation_prompt "a woman with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes and a robotic gazelle with a sturdy aluminum frame, an agile build, articulated legs and curved, metallic horns are moving in the city" \
  --validation_prompt_separator ::: \
  --num_validation_videos 1 \
  --validation_epochs 1 \
  --seed 42 \
  --rank 32 \
  --lora_alpha 32 \
  --mixed_precision bf16 \
  --output_dir "$OUTPUT_PATH" \
  --height 480 \
  --width 720 \
  --fps 8 \
  --max_num_frames 49 \
  --skip_frames_start 0 \
  --skip_frames_end 0 \
  --train_batch_size 2 \
  --num_train_epochs 1000 \
  --checkpointing_steps 1000 \
  --gradient_accumulation_steps 1 \
  --learning_rate 3e-4 \
  --lr_scheduler cosine_with_restarts \
  --lr_warmup_steps 200 \
  --lr_num_cycles 1 \
  --optimizer AdamW \
  --adam_beta1 0.9 \
  --adam_beta2 0.95 \
  --max_grad_norm 1.0 \
  --allow_tf32 \
  --report_to wandb
================================================
FILE: CogVideo/finetune/hostfile.txt
================================================
node1 slots=8
node2 slots=8
================================================
FILE: CogVideo/finetune/models/attention.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional, Tuple
import torch
import torch.nn.functional as F
from torch import nn
from diffusers.utils import deprecate, logging
from diffusers.utils.torch_utils import maybe_allow_in_graph
from diffusers.models.activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, SwiGLU
from models.attention_processor import Attention, JointAttnProcessor2_0
from diffusers.models.embeddings import SinusoidalPositionalEmbedding
from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm, SD35AdaLayerNormZeroX
logger = logging.get_logger(__name__)
def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int):
# "feed_forward_chunk_size" can be used to save memory
if hidden_states.shape[chunk_dim] % chunk_size != 0:
raise ValueError(
f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
)
num_chunks = hidden_states.shape[chunk_dim] // chunk_size
ff_output = torch.cat(
[ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
dim=chunk_dim,
)
return ff_output
@maybe_allow_in_graph
class GatedSelfAttentionDense(nn.Module):
    r"""
    A gated self-attention dense layer that combines visual features and object features.

    Parameters:
        query_dim (`int`): The number of channels in the query.
        context_dim (`int`): The number of channels in the context.
        n_heads (`int`): The number of heads to use for attention.
        d_head (`int`): The number of channels in each head.
    """

    def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
        super().__init__()

        # Project object features into the visual feature space so the two can be
        # concatenated along the token dimension.
        self.linear = nn.Linear(context_dim, query_dim)

        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
        self.ff = FeedForward(query_dim, activation_fn="geglu")

        self.norm1 = nn.LayerNorm(query_dim)
        self.norm2 = nn.LayerNorm(query_dim)

        # Learnable scalar gates; zero-initialized so the module starts as an identity.
        self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
        self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))

        self.enabled = True

    def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
        """Fuse object features `objs` into visual features `x` via gated self-attention."""
        if not self.enabled:
            return x

        visual_tokens = x.shape[1]
        objs = self.linear(objs)

        # Attend over [visual; object] tokens, then keep only the visual outputs.
        attn_in = self.norm1(torch.cat([x, objs], dim=1))
        x = x + self.alpha_attn.tanh() * self.attn(attn_in)[:, :visual_tokens, :]
        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))

        return x
@maybe_allow_in_graph
class JointTransformerBlock(nn.Module):
    r"""
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
            processing of `context` conditions.
        qk_norm (`str`, *optional*): Query/key normalization type forwarded to `Attention`.
        use_dual_attention (`bool`, defaults to `False`): If `True`, adds a second,
            image-stream-only self-attention branch (SD3.5-style).
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        context_pre_only: bool = False,
        qk_norm: Optional[str] = None,
        use_dual_attention: bool = False,
    ):
        super().__init__()

        self.use_dual_attention = use_dual_attention
        self.context_pre_only = context_pre_only
        # NOTE: "ada_norm_continous" (sic) matches the spelling used by diffusers' norm keys.
        context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero"

        if use_dual_attention:
            # Dual attention needs an AdaLN that emits a second set of modulation params.
            self.norm1 = SD35AdaLayerNormZeroX(dim)
        else:
            self.norm1 = AdaLayerNormZero(dim)

        if context_norm_type == "ada_norm_continous":
            self.norm1_context = AdaLayerNormContinuous(
                dim, dim, elementwise_affine=False, eps=1e-6, bias=True, norm_type="layer_norm"
            )
        elif context_norm_type == "ada_norm_zero":
            self.norm1_context = AdaLayerNormZero(dim)
        else:
            raise ValueError(
                f"Unknown context_norm_type: {context_norm_type}, currently only support `ada_norm_continous`, `ada_norm_zero`"
            )

        if hasattr(F, "scaled_dot_product_attention"):
            processor = JointAttnProcessor2_0()
        else:
            raise ValueError(
                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
            )

        # Joint attention over the image stream with added key/value projections for the context stream.
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            added_kv_proj_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=context_pre_only,
            bias=True,
            processor=processor,
            qk_norm=qk_norm,
            eps=1e-6,
        )

        if use_dual_attention:
            # Second self-attention branch acting on the image stream only.
            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=None,
                dim_head=attention_head_dim,
                heads=num_attention_heads,
                out_dim=dim,
                bias=True,
                processor=processor,
                qk_norm=qk_norm,
                eps=1e-6,
            )
        else:
            self.attn2 = None

        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        if not context_pre_only:
            # The context stream gets its own FF path only when it is still being updated.
            self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
            self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
        else:
            self.norm2_context = None
            self.ff_context = None

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    # Copied from diffusers.models.attention.BasicTransformerBlock.set_chunk_feed_forward
    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor
    ):
        """Run one MMDiT block.

        Returns `(encoder_hidden_states, hidden_states)`; `encoder_hidden_states`
        is `None` when `context_pre_only` is set.
        """
        # AdaLN modulation of the image stream (dual attention also yields a second
        # normalized copy and gate for the extra branch).
        if self.use_dual_attention:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
                hidden_states, emb=temb
            )
        else:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)

        if self.context_pre_only:
            norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
        else:
            norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
                encoder_hidden_states, emb=temb
            )

        # Attention.
        attn_output, context_attn_output = self.attn(
            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states
        )

        # Process attention outputs for the `hidden_states`.
        attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = hidden_states + attn_output

        if self.use_dual_attention:
            attn_output2 = self.attn2(hidden_states=norm_hidden_states2)
            attn_output2 = gate_msa2.unsqueeze(1) * attn_output2
            hidden_states = hidden_states + attn_output2

        # Feed-forward on the image stream with AdaLN scale/shift and a gated residual.
        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
        if self._chunk_size is not None:
            # "feed_forward_chunk_size" can be used to save memory
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp.unsqueeze(1) * ff_output

        hidden_states = hidden_states + ff_output

        # Process attention outputs for the `encoder_hidden_states`.
        if self.context_pre_only:
            encoder_hidden_states = None
        else:
            context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
            encoder_hidden_states = encoder_hidden_states + context_attn_output

            norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
            norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
            if self._chunk_size is not None:
                # "feed_forward_chunk_size" can be used to save memory
                context_ff_output = _chunked_feed_forward(
                    self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
                )
            else:
                context_ff_output = self.ff_context(norm_encoder_hidden_states)
            encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output

        return encoder_hidden_states, hidden_states
@maybe_allow_in_graph
class BasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block: self-attention, optional cross-attention, feed-forward,
    each preceded by a configurable normalization (`norm_type`).

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm (:
            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (:
            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
        final_dropout (`bool` *optional*, defaults to False):
            Whether to apply a final dropout after the last feed-forward layer.
        attention_type (`str`, *optional*, defaults to `"default"`):
            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
        positional_embeddings (`str`, *optional*, defaults to `None`):
            The type of positional embeddings to apply to.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
        norm_eps: float = 1e-5,
        final_dropout: bool = False,
        attention_type: str = "default",
        positional_embeddings: Optional[str] = None,
        num_positional_embeddings: Optional[int] = None,
        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
        ada_norm_bias: Optional[int] = None,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
    ):
        super().__init__()
        # Store constructor arguments so downstream code can introspect the block config.
        self.dim = dim
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        self.dropout = dropout
        self.cross_attention_dim = cross_attention_dim
        self.activation_fn = activation_fn
        self.attention_bias = attention_bias
        self.double_self_attention = double_self_attention
        self.norm_elementwise_affine = norm_elementwise_affine
        self.positional_embeddings = positional_embeddings
        self.num_positional_embeddings = num_positional_embeddings
        self.only_cross_attention = only_cross_attention

        # We keep these boolean flags for backward-compatibility.
        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
        self.use_layer_norm = norm_type == "layer_norm"
        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        self.norm_type = norm_type
        self.num_embeds_ada_norm = num_embeds_ada_norm

        if positional_embeddings and (num_positional_embeddings is None):
            raise ValueError(
                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
            )

        if positional_embeddings == "sinusoidal":
            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
        else:
            self.pos_embed = None

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        if norm_type == "ada_norm":
            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
        elif norm_type == "ada_norm_zero":
            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
        elif norm_type == "ada_norm_continuous":
            self.norm1 = AdaLayerNormContinuous(
                dim,
                ada_norm_continous_conditioning_embedding_dim,
                norm_elementwise_affine,
                norm_eps,
                ada_norm_bias,
                "rms_norm",
            )
        else:
            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
            out_bias=attention_out_bias,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
            # the second cross attention block.
            if norm_type == "ada_norm":
                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
            elif norm_type == "ada_norm_continuous":
                self.norm2 = AdaLayerNormContinuous(
                    dim,
                    ada_norm_continous_conditioning_embedding_dim,
                    norm_elementwise_affine,
                    norm_eps,
                    ada_norm_bias,
                    "rms_norm",
                )
            else:
                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
                out_bias=attention_out_bias,
            )  # is self-attn if encoder_hidden_states is none
        else:
            if norm_type == "ada_norm_single":  # For Latte
                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
            else:
                self.norm2 = None
            self.attn2 = None

        # 3. Feed-forward
        if norm_type == "ada_norm_continuous":
            self.norm3 = AdaLayerNormContinuous(
                dim,
                ada_norm_continous_conditioning_embedding_dim,
                norm_elementwise_affine,
                norm_eps,
                ada_norm_bias,
                "layer_norm",
            )
        elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm"]:
            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
        elif norm_type == "layer_norm_i2vgen":
            self.norm3 = None

        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

        # 4. Fuser
        if attention_type == "gated" or attention_type == "gated-text-image":
            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)

        # 5. Scale-shift for PixArt-Alpha.
        if norm_type == "ada_norm_single":
            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        timestep: Optional[torch.LongTensor] = None,
        cross_attention_kwargs: Dict[str, Any] = None,
        class_labels: Optional[torch.LongTensor] = None,
        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ) -> torch.Tensor:
        """Run self-attention, optional cross-attention, and feed-forward with residuals."""
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Self-Attention
        batch_size = hidden_states.shape[0]

        if self.norm_type == "ada_norm":
            norm_hidden_states = self.norm1(hidden_states, timestep)
        elif self.norm_type == "ada_norm_zero":
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
            )
        elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
            norm_hidden_states = self.norm1(hidden_states)
        elif self.norm_type == "ada_norm_continuous":
            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
        elif self.norm_type == "ada_norm_single":
            # `timestep` here carries the packed 6-way modulation parameters.
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
            ).chunk(6, dim=1)
            norm_hidden_states = self.norm1(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
        else:
            raise ValueError("Incorrect norm used")

        if self.pos_embed is not None:
            norm_hidden_states = self.pos_embed(norm_hidden_states)

        # 1. Prepare GLIGEN inputs
        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)

        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )

        if self.norm_type == "ada_norm_zero":
            attn_output = gate_msa.unsqueeze(1) * attn_output
        elif self.norm_type == "ada_norm_single":
            attn_output = gate_msa * attn_output

        hidden_states = attn_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        # 1.2 GLIGEN Control
        if gligen_kwargs is not None:
            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])

        # 3. Cross-Attention
        if self.attn2 is not None:
            if self.norm_type == "ada_norm":
                norm_hidden_states = self.norm2(hidden_states, timestep)
            elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
                norm_hidden_states = self.norm2(hidden_states)
            elif self.norm_type == "ada_norm_single":
                # For PixArt norm2 isn't applied here:
                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
                norm_hidden_states = hidden_states
            elif self.norm_type == "ada_norm_continuous":
                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
            else:
                raise ValueError("Incorrect norm")

            if self.pos_embed is not None and self.norm_type != "ada_norm_single":
                norm_hidden_states = self.pos_embed(norm_hidden_states)

            attn_output = self.attn2(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                **cross_attention_kwargs,
            )
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        # i2vgen doesn't have this norm 🤷‍♂️
        if self.norm_type == "ada_norm_continuous":
            norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
        elif not self.norm_type == "ada_norm_single":
            norm_hidden_states = self.norm3(hidden_states)

        if self.norm_type == "ada_norm_zero":
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        if self.norm_type == "ada_norm_single":
            norm_hidden_states = self.norm2(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

        if self._chunk_size is not None:
            # "feed_forward_chunk_size" can be used to save memory
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)

        if self.norm_type == "ada_norm_zero":
            ff_output = gate_mlp.unsqueeze(1) * ff_output
        elif self.norm_type == "ada_norm_single":
            ff_output = gate_mlp * ff_output

        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states
class LuminaFeedForward(nn.Module):
    r"""
    A SwiGLU-style feed-forward layer.

    Parameters:
        dim (`int`):
            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
            hidden representations.
        inner_dim (`int`): The requested intermediate dimension of the feedforward layer; it is
            shrunk by a factor of 2/3 and rounded up to a multiple of `multiple_of` before use.
        multiple_of (`int`, *optional*, defaults to 256): Value to ensure the hidden dimension
            is a multiple of this value.
        ffn_dim_multiplier (float, *optional*): Custom multiplier for the hidden dimension.
            Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        inner_dim: int,
        multiple_of: Optional[int] = 256,
        ffn_dim_multiplier: Optional[float] = None,
    ):
        super().__init__()

        # SwiGLU convention: shrink the requested width by 2/3, optionally rescale,
        # then round up to the nearest multiple of `multiple_of`.
        hidden = int(2 * inner_dim / 3)
        if ffn_dim_multiplier is not None:
            hidden = int(ffn_dim_multiplier * hidden)
        hidden = multiple_of * ((hidden + multiple_of - 1) // multiple_of)

        self.linear_1 = nn.Linear(dim, hidden, bias=False)
        self.linear_2 = nn.Linear(hidden, dim, bias=False)
        self.linear_3 = nn.Linear(dim, hidden, bias=False)
        self.silu = FP32SiLU()

    def forward(self, x):
        # SwiGLU: the SiLU branch is gated elementwise by a parallel projection.
        return self.linear_2(self.silu(self.linear_1(x)) * self.linear_3(x))
@maybe_allow_in_graph
class TemporalBasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block operating over the temporal axis of video-like data.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        time_mix_inner_dim (`int`): The number of channels for temporal attention.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
    """

    def __init__(
        self,
        dim: int,
        time_mix_inner_dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        cross_attention_dim: Optional[int] = None,
    ):
        super().__init__()
        # The input residual can only be reused when the channel count is unchanged.
        self.is_res = dim == time_mix_inner_dim

        self.norm_in = nn.LayerNorm(dim)

        # Three sub-blocks, each with its own pre-normalization layer.
        # 1. Input projection + temporal self-attention.
        self.ff_in = FeedForward(dim, dim_out=time_mix_inner_dim, activation_fn="geglu")
        self.norm1 = nn.LayerNorm(time_mix_inner_dim)
        self.attn1 = Attention(
            query_dim=time_mix_inner_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            cross_attention_dim=None,
        )

        # 2. Optional cross-attention (acts as self-attention when no encoder
        #    states are passed at call time).
        if cross_attention_dim is None:
            self.norm2 = None
            self.attn2 = None
        else:
            self.norm2 = nn.LayerNorm(time_mix_inner_dim)
            self.attn2 = Attention(
                query_dim=time_mix_inner_dim,
                cross_attention_dim=cross_attention_dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
            )

        # 3. Feed-forward.
        self.norm3 = nn.LayerNorm(time_mix_inner_dim)
        self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")

        # Feed-forward chunking is disabled until explicitly requested.
        self._chunk_size = None
        self._chunk_dim = None

    def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
        """Enable chunked feed-forward; dim 1 gives the best speed/memory trade-off."""
        self._chunk_size = chunk_size
        self._chunk_dim = 1

    def forward(
        self,
        hidden_states: torch.Tensor,
        num_frames: int,
        encoder_hidden_states: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Incoming layout: (batch * frames, seq, channels). Swap frame and
        # sequence axes so attention mixes information across time.
        batch_frames, seq_length, channels = hidden_states.shape
        batch_size = batch_frames // num_frames
        hidden_states = (
            hidden_states.reshape(batch_size, num_frames, seq_length, channels)
            .permute(0, 2, 1, 3)
            .reshape(batch_size * seq_length, num_frames, channels)
        )

        # 0. Input projection with optional residual.
        residual = hidden_states
        hidden_states = self.norm_in(hidden_states)
        if self._chunk_size is None:
            hidden_states = self.ff_in(hidden_states)
        else:
            hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size)
        if self.is_res:
            hidden_states = hidden_states + residual

        # 1. Temporal self-attention.
        attn_output = self.attn1(self.norm1(hidden_states), encoder_hidden_states=None)
        hidden_states = attn_output + hidden_states

        # 2. Cross-attention, if configured.
        if self.attn2 is not None:
            attn_output = self.attn2(self.norm2(hidden_states), encoder_hidden_states=encoder_hidden_states)
            hidden_states = attn_output + hidden_states

        # 3. Feed-forward.
        norm_hidden_states = self.norm3(hidden_states)
        if self._chunk_size is None:
            ff_output = self.ff(norm_hidden_states)
        else:
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        hidden_states = ff_output + hidden_states if self.is_res else ff_output

        # Restore the original (batch * frames, seq, channels) layout.
        hidden_states = (
            hidden_states.reshape(batch_size, seq_length, num_frames, channels)
            .permute(0, 2, 1, 3)
            .reshape(batch_size * num_frames, seq_length, channels)
        )
        return hidden_states
class SkipFFTransformerBlock(nn.Module):
    """Transformer block with two attention layers and no feed-forward (used by aMUSEd)."""

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        kv_input_dim: int,
        kv_input_dim_proj_use_bias: bool,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        attention_out_bias: bool = True,
    ):
        super().__init__()
        # Project encoder states into the block width only when they differ.
        if kv_input_dim == dim:
            self.kv_mapper = None
        else:
            self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias)

        self.norm1 = RMSNorm(dim, 1e-06)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim,
            out_bias=attention_out_bias,
        )

        self.norm2 = RMSNorm(dim, 1e-06)
        self.attn2 = Attention(
            query_dim=dim,
            cross_attention_dim=cross_attention_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            out_bias=attention_out_bias,
        )

    def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs):
        # Copy so the caller's dict is never mutated downstream.
        cross_attention_kwargs = {} if cross_attention_kwargs is None else cross_attention_kwargs.copy()

        if self.kv_mapper is not None:
            encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states))

        # Two attention layers, each with pre-norm and a residual add.
        hidden_states = hidden_states + self.attn1(
            self.norm1(hidden_states),
            encoder_hidden_states=encoder_hidden_states,
            **cross_attention_kwargs,
        )
        hidden_states = hidden_states + self.attn2(
            self.norm2(hidden_states),
            encoder_hidden_states=encoder_hidden_states,
            **cross_attention_kwargs,
        )
        return hidden_states
@maybe_allow_in_graph
class FreeNoiseTransformerBlock(nn.Module):
    r"""
    A FreeNoise Transformer block.

    Parameters:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability to use.
        cross_attention_dim (`int`, *optional*):
            The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to be used in feed-forward.
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, defaults to `False`):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, defaults to `False`):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, defaults to `False`):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
        final_dropout (`bool` defaults to `False`):
            Whether to apply a final dropout after the last feed-forward layer.
        positional_embeddings (`str`, *optional*):
            The type of positional embeddings to apply to.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.
        ff_inner_dim (`int`, *optional*):
            Hidden dimension of feed-forward MLP.
        ff_bias (`bool`, defaults to `True`):
            Whether or not to use bias in feed-forward MLP.
        attention_out_bias (`bool`, defaults to `True`):
            Whether or not to use bias in attention output project layer.
        context_length (`int`, defaults to `16`):
            The maximum number of frames that the FreeNoise block processes at once.
        context_stride (`int`, defaults to `4`):
            The number of frames to be skipped before starting to process a new batch of `context_length` frames.
        weighting_scheme (`str`, defaults to `"pyramid"`):
            The weighting scheme to use for weighting averaging of processed latent frames. As described in the
            Equation 9. of the [FreeNoise](https://arxiv.org/abs/2310.15169) paper, "pyramid" is the default setting
            used.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout: float = 0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",
        norm_eps: float = 1e-5,
        final_dropout: bool = False,
        positional_embeddings: Optional[str] = None,
        num_positional_embeddings: Optional[int] = None,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
        context_length: int = 16,
        context_stride: int = 4,
        weighting_scheme: str = "pyramid",
    ):
        super().__init__()
        self.dim = dim
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        self.dropout = dropout
        self.cross_attention_dim = cross_attention_dim
        self.activation_fn = activation_fn
        self.attention_bias = attention_bias
        self.double_self_attention = double_self_attention
        self.norm_elementwise_affine = norm_elementwise_affine
        self.positional_embeddings = positional_embeddings
        self.num_positional_embeddings = num_positional_embeddings
        self.only_cross_attention = only_cross_attention

        self.set_free_noise_properties(context_length, context_stride, weighting_scheme)

        # We keep these boolean flags for backward-compatibility.
        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
        self.use_layer_norm = norm_type == "layer_norm"
        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        self.norm_type = norm_type
        self.num_embeds_ada_norm = num_embeds_ada_norm

        if positional_embeddings and (num_positional_embeddings is None):
            raise ValueError(
                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
            )

        if positional_embeddings == "sinusoidal":
            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
        else:
            self.pos_embed = None

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
            out_bias=attention_out_bias,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
            self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
                out_bias=attention_out_bias,
            )  # is self-attn if encoder_hidden_states is none

        # 3. Feed-forward
        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

        self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def _get_frame_indices(self, num_frames: int) -> List[Tuple[int, int]]:
        """Return `(start, end)` windows of `context_length` frames spaced by `context_stride`."""
        frame_indices = []
        for i in range(0, num_frames - self.context_length + 1, self.context_stride):
            window_start = i
            window_end = min(num_frames, i + self.context_length)
            frame_indices.append((window_start, window_end))
        return frame_indices

    def _get_frame_weights(self, num_frames: int, weighting_scheme: str = "pyramid") -> List[float]:
        """Return per-frame averaging weights for one context window (Eq. 9 of FreeNoise)."""
        if weighting_scheme == "flat":
            weights = [1.0] * num_frames

        elif weighting_scheme == "pyramid":
            if num_frames % 2 == 0:
                # num_frames = 4 => [1, 2, 2, 1]
                mid = num_frames // 2
                weights = list(range(1, mid + 1))
                weights = weights + weights[::-1]
            else:
                # num_frames = 5 => [1, 2, 3, 2, 1]
                mid = (num_frames + 1) // 2
                weights = list(range(1, mid))
                weights = weights + [mid] + weights[::-1]

        elif weighting_scheme == "delayed_reverse_sawtooth":
            if num_frames % 2 == 0:
                # num_frames = 4 => [0.01, 2, 2, 1]
                mid = num_frames // 2
                weights = [0.01] * (mid - 1) + [mid]
                weights = weights + list(range(mid, 0, -1))
            else:
                # num_frames = 5 => [0.01, 0.01, 3, 2, 1]
                mid = (num_frames + 1) // 2
                weights = [0.01] * mid
                weights = weights + list(range(mid, 0, -1))

        else:
            raise ValueError(f"Unsupported value for weighting_scheme={weighting_scheme}")

        return weights

    def set_free_noise_properties(
        self, context_length: int, context_stride: int, weighting_scheme: str = "pyramid"
    ) -> None:
        """Configure the FreeNoise sliding-window parameters."""
        self.context_length = context_length
        self.context_stride = context_stride
        self.weighting_scheme = weighting_scheme

    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0) -> None:
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Dict[str, Any] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}

        # hidden_states: [B x H x W, F, C]
        device = hidden_states.device
        dtype = hidden_states.dtype

        num_frames = hidden_states.size(1)
        frame_indices = self._get_frame_indices(num_frames)
        frame_weights = self._get_frame_weights(self.context_length, self.weighting_scheme)
        frame_weights = torch.tensor(frame_weights, device=device, dtype=dtype).unsqueeze(0).unsqueeze(-1)

        # Fix: validate *before* indexing `frame_indices[-1]` — when
        # num_frames < context_length the list is empty and the original code
        # raised an opaque IndexError instead of this intended ValueError.
        if num_frames < self.context_length:
            raise ValueError(f"Expected {num_frames=} to be greater or equal than {self.context_length=}")

        is_last_frame_batch_complete = frame_indices[-1][1] == num_frames

        # Handle out-of-bounds case if num_frames isn't perfectly divisible by context_length
        # For example, num_frames=25, context_length=16, context_stride=4, then we expect the ranges:
        # [(0, 16), (4, 20), (8, 24), (9, 25)]
        if not is_last_frame_batch_complete:
            last_frame_batch_length = num_frames - frame_indices[-1][1]
            frame_indices.append((num_frames - self.context_length, num_frames))

        num_times_accumulated = torch.zeros((1, num_frames, 1), device=device)
        accumulated_values = torch.zeros_like(hidden_states)

        for i, (frame_start, frame_end) in enumerate(frame_indices):
            # Per-frame weights for this window; `weights` has shape
            # (1, frame_end - frame_start, 1) and broadcasts over batch/channels.
            weights = torch.ones_like(num_times_accumulated[:, frame_start:frame_end])
            weights *= frame_weights

            hidden_states_chunk = hidden_states[:, frame_start:frame_end]

            # Notice that normalization is always applied before the real computation in the following blocks.
            # 1. Self-Attention
            norm_hidden_states = self.norm1(hidden_states_chunk)

            if self.pos_embed is not None:
                norm_hidden_states = self.pos_embed(norm_hidden_states)

            attn_output = self.attn1(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
                attention_mask=attention_mask,
                **cross_attention_kwargs,
            )

            hidden_states_chunk = attn_output + hidden_states_chunk
            if hidden_states_chunk.ndim == 4:
                hidden_states_chunk = hidden_states_chunk.squeeze(1)

            # 2. Cross-Attention
            if self.attn2 is not None:
                norm_hidden_states = self.norm2(hidden_states_chunk)

                if self.pos_embed is not None and self.norm_type != "ada_norm_single":
                    norm_hidden_states = self.pos_embed(norm_hidden_states)

                attn_output = self.attn2(
                    norm_hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=encoder_attention_mask,
                    **cross_attention_kwargs,
                )
                hidden_states_chunk = attn_output + hidden_states_chunk

            if i == len(frame_indices) - 1 and not is_last_frame_batch_complete:
                # The final (re-anchored) window overlaps earlier ones; only the
                # trailing `last_frame_batch_length` frames are new, so slice to
                # accumulate just those.
                accumulated_values[:, -last_frame_batch_length:] += (
                    hidden_states_chunk[:, -last_frame_batch_length:] * weights[:, -last_frame_batch_length:]
                )
                # Fix: the denominator must accumulate the same per-frame slice
                # `weights[:, -last_frame_batch_length:]` as the numerator above.
                # The original `weights[:, -last_frame_batch_length]` (no colon)
                # picked a single weight and broadcast it over all trailing
                # frames, skewing the weighted average of the last chunk.
                num_times_accumulated[:, -last_frame_batch_length:] += weights[:, -last_frame_batch_length:]
            else:
                accumulated_values[:, frame_start:frame_end] += hidden_states_chunk * weights
                num_times_accumulated[:, frame_start:frame_end] += weights

        # TODO(aryan): Maybe this could be done in a better way.
        #
        # Previously, this was:
        #     hidden_states = torch.where(
        #         num_times_accumulated > 0, accumulated_values / num_times_accumulated, accumulated_values
        #     )
        #
        # The reasoning for the change here is `torch.where` became a bottleneck at some point when golfing memory
        # spikes. It is particularly noticeable when the number of frames is high. My understanding is that this comes
        # from tensors being copied - which is why we resort to spliting and concatenating here. I've not particularly
        # looked into this deeply because other memory optimizations led to more pronounced reductions.
        hidden_states = torch.cat(
            [
                torch.where(num_times_split > 0, accumulated_split / num_times_split, accumulated_split)
                for accumulated_split, num_times_split in zip(
                    accumulated_values.split(self.context_length, dim=1),
                    num_times_accumulated.split(self.context_length, dim=1),
                )
            ],
            dim=1,
        ).to(dtype)

        # 3. Feed-forward
        norm_hidden_states = self.norm3(hidden_states)

        if self._chunk_size is not None:
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)

        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states
class FeedForward(nn.Module):
    r"""
    A feed-forward layer.

    Parameters:
        dim (`int`): The number of channels in the input.
        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
            One of `"gelu"`, `"gelu-approximate"`, `"geglu"`, `"geglu-approximate"` or `"swiglu"`.
        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
        inner_dim (`int`, *optional*): Hidden dimension; defaults to `dim * mult` when not given.
        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.

    Raises:
        ValueError: If `activation_fn` is not one of the supported values.
    """

    def __init__(
        self,
        dim: int,
        dim_out: Optional[int] = None,
        mult: int = 4,
        dropout: float = 0.0,
        activation_fn: str = "geglu",
        final_dropout: bool = False,
        inner_dim=None,
        bias: bool = True,
    ):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim

        # Fix: the original mixed a standalone `if` with an `elif` chain and had
        # no `else`, so an unsupported `activation_fn` left `act_fn` unbound and
        # crashed later with an opaque NameError. Use one uniform chain and
        # raise a clear error instead.
        if activation_fn == "gelu":
            act_fn = GELU(dim, inner_dim, bias=bias)
        elif activation_fn == "gelu-approximate":
            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
        elif activation_fn == "geglu":
            act_fn = GEGLU(dim, inner_dim, bias=bias)
        elif activation_fn == "geglu-approximate":
            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
        elif activation_fn == "swiglu":
            act_fn = SwiGLU(dim, inner_dim, bias=bias)
        else:
            raise ValueError(f"Unsupported activation_fn: {activation_fn!r}")

        self.net = nn.ModuleList([])
        # project in
        self.net.append(act_fn)
        # project dropout
        self.net.append(nn.Dropout(dropout))
        # project out
        self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
        if final_dropout:
            self.net.append(nn.Dropout(dropout))

    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        """Apply the feed-forward stack; a legacy `scale` argument is deprecated and ignored."""
        if len(args) > 0 or kwargs.get("scale", None) is not None:
            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
            deprecate("scale", "1.0.0", deprecation_message)
        for module in self.net:
            hidden_states = module(hidden_states)
        return hidden_states
================================================
FILE: CogVideo/finetune/models/attention_processor.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import math
from typing import Callable, List, Optional, Tuple, Union
from einops import rearrange, repeat
import torch
import torch.nn.functional as F
from torch import nn
from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import deprecate, logging
from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
from diffusers.utils.torch_utils import is_torch_version, maybe_allow_in_graph
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
if is_torch_npu_available():
import torch_npu
if is_xformers_available():
import xformers
import xformers.ops
else:
xformers = None
@maybe_allow_in_graph
class Attention(nn.Module):
r"""
A cross attention layer.
Parameters:
query_dim (`int`):
The number of channels in the query.
cross_attention_dim (`int`, *optional*):
The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
heads (`int`, *optional*, defaults to 8):
The number of heads to use for multi-head attention.
kv_heads (`int`, *optional*, defaults to `None`):
The number of key and value heads to use for multi-head attention. Defaults to `heads`. If
`kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi
Query Attention (MQA) otherwise GQA is used.
dim_head (`int`, *optional*, defaults to 64):
The number of channels in each head.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability to use.
bias (`bool`, *optional*, defaults to False):
Set to `True` for the query, key, and value linear layers to contain a bias parameter.
upcast_attention (`bool`, *optional*, defaults to False):
Set to `True` to upcast the attention computation to `float32`.
upcast_softmax (`bool`, *optional*, defaults to False):
Set to `True` to upcast the softmax computation to `float32`.
cross_attention_norm (`str`, *optional*, defaults to `None`):
The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
The number of groups to use for the group norm in the cross attention.
added_kv_proj_dim (`int`, *optional*, defaults to `None`):
The number of channels to use for the added key and value projections. If `None`, no projection is used.
norm_num_groups (`int`, *optional*, defaults to `None`):
The number of groups to use for the group norm in the attention.
spatial_norm_dim (`int`, *optional*, defaults to `None`):
The number of channels to use for the spatial normalization.
out_bias (`bool`, *optional*, defaults to `True`):
Set to `True` to use a bias in the output linear layer.
scale_qk (`bool`, *optional*, defaults to `True`):
Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
only_cross_attention (`bool`, *optional*, defaults to `False`):
Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
`added_kv_proj_dim` is not `None`.
eps (`float`, *optional*, defaults to 1e-5):
An additional value added to the denominator in group normalization that is used for numerical stability.
rescale_output_factor (`float`, *optional*, defaults to 1.0):
A factor to rescale the output by dividing it with this value.
residual_connection (`bool`, *optional*, defaults to `False`):
Set to `True` to add the residual connection to the output.
_from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
Set to `True` if the attention block is loaded from a deprecated state dict.
processor (`AttnProcessor`, *optional*, defaults to `None`):
The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
`AttnProcessor` otherwise.
"""
def __init__(
    self,
    query_dim: int,
    cross_attention_dim: Optional[int] = None,
    heads: int = 8,
    kv_heads: Optional[int] = None,
    dim_head: int = 64,
    dropout: float = 0.0,
    bias: bool = False,
    upcast_attention: bool = False,
    upcast_softmax: bool = False,
    cross_attention_norm: Optional[str] = None,
    cross_attention_norm_num_groups: int = 32,
    qk_norm: Optional[str] = None,
    added_kv_proj_dim: Optional[int] = None,
    added_proj_bias: Optional[bool] = True,
    norm_num_groups: Optional[int] = None,
    spatial_norm_dim: Optional[int] = None,
    out_bias: bool = True,
    scale_qk: bool = True,
    only_cross_attention: bool = False,
    eps: float = 1e-5,
    rescale_output_factor: float = 1.0,
    residual_connection: bool = False,
    _from_deprecated_attn_block: bool = False,
    processor: Optional["AttnProcessor"] = None,
    out_dim: int = None,
    context_pre_only=None,
    pre_only=False,
    elementwise_affine: bool = True,
):
    """Build the attention projections, optional normalization layers and processor.

    See the class docstring for the meaning of each parameter.
    """
    super().__init__()

    # To prevent circular import.
    from diffusers.models.normalization import FP32LayerNorm, RMSNorm

    # Inner (projected) dimensions: `out_dim` overrides `dim_head * heads`;
    # key/value may use fewer heads (`kv_heads`) for MQA/GQA-style attention.
    self.inner_dim = out_dim if out_dim is not None else dim_head * heads
    self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
    self.query_dim = query_dim
    self.use_bias = bias
    self.is_cross_attention = cross_attention_dim is not None
    self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
    self.upcast_attention = upcast_attention
    self.upcast_softmax = upcast_softmax
    self.rescale_output_factor = rescale_output_factor
    self.residual_connection = residual_connection
    self.dropout = dropout
    self.fused_projections = False
    self.out_dim = out_dim if out_dim is not None else query_dim
    self.context_pre_only = context_pre_only
    self.pre_only = pre_only

    # we make use of this private variable to know whether this class is loaded
    # with a deprecated state dict so that we can convert it on the fly
    self._from_deprecated_attn_block = _from_deprecated_attn_block

    self.scale_qk = scale_qk
    # Softmax scaling: 1/sqrt(dim_head) when enabled, otherwise no scaling.
    self.scale = dim_head**-0.5 if self.scale_qk else 1.0

    self.heads = out_dim // dim_head if out_dim is not None else heads
    # for slice_size > 0 the attention score computation
    # is split across the batch axis to save memory
    # You can set slice_size with `set_attention_slice`
    self.sliceable_head_dim = heads

    self.added_kv_proj_dim = added_kv_proj_dim
    self.only_cross_attention = only_cross_attention

    if self.added_kv_proj_dim is None and self.only_cross_attention:
        raise ValueError(
            "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
        )

    # Optional normalization applied to the query input.
    if norm_num_groups is not None:
        self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
    else:
        self.group_norm = None

    if spatial_norm_dim is not None:
        self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
    else:
        self.spatial_norm = None

    # Optional query/key normalization (per head, or across all heads for Lumina).
    if qk_norm is None:
        self.norm_q = None
        self.norm_k = None
    elif qk_norm == "layer_norm":
        self.norm_q = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
        self.norm_k = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
    elif qk_norm == "fp32_layer_norm":
        self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
        self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
    elif qk_norm == "layer_norm_across_heads":
        # Lumina applies qk norm across all heads
        self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps)
        self.norm_k = nn.LayerNorm(dim_head * kv_heads, eps=eps)
    elif qk_norm == "rms_norm":
        self.norm_q = RMSNorm(dim_head, eps=eps)
        self.norm_k = RMSNorm(dim_head, eps=eps)
    else:
        raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None,'layer_norm','fp32_layer_norm','rms_norm'")

    # Optional normalization of `encoder_hidden_states` before the k/v projection.
    if cross_attention_norm is None:
        self.norm_cross = None
    elif cross_attention_norm == "layer_norm":
        self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
    elif cross_attention_norm == "group_norm":
        if self.added_kv_proj_dim is not None:
            # The given `encoder_hidden_states` are initially of shape
            # (batch_size, seq_len, added_kv_proj_dim) before being projected
            # to (batch_size, seq_len, cross_attention_dim). The norm is applied
            # before the projection, so we need to use `added_kv_proj_dim` as
            # the number of channels for the group norm.
            norm_cross_num_channels = added_kv_proj_dim
        else:
            norm_cross_num_channels = self.cross_attention_dim

        self.norm_cross = nn.GroupNorm(
            num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
        )
    else:
        raise ValueError(
            f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
        )

    # Q/K/V projections; k/v come from `cross_attention_dim` (== query_dim for self-attention).
    self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)

    if not self.only_cross_attention:
        # only relevant for the `AddedKVProcessor` classes
        self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
        self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
    else:
        self.to_k = None
        self.to_v = None

    # Extra k/v (and optionally q) projections for processors with added states.
    self.added_proj_bias = added_proj_bias
    if self.added_kv_proj_dim is not None:
        self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
        self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
        if self.context_pre_only is not None:
            self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)

    # Output projection (+ dropout), skipped entirely in `pre_only` mode.
    if not self.pre_only:
        self.to_out = nn.ModuleList([])
        self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
        self.to_out.append(nn.Dropout(dropout))

    if self.context_pre_only is not None and not self.context_pre_only:
        self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)

    # qk-norm counterparts for the added-projection path.
    if qk_norm is not None and added_kv_proj_dim is not None:
        if qk_norm == "fp32_layer_norm":
            self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
            self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
        elif qk_norm == "rms_norm":
            self.norm_added_q = RMSNorm(dim_head, eps=eps)
            self.norm_added_k = RMSNorm(dim_head, eps=eps)
        else:
            raise ValueError(
                f"unknown qk_norm: {qk_norm}. Should be one of `None,'layer_norm','fp32_layer_norm','rms_norm'`"
            )
    else:
        self.norm_added_q = None
        self.norm_added_k = None

    # set attention processor
    # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
    # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
    # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
    if processor is None:
        processor = (
            AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
        )
    self.set_processor(processor)
def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
    r"""
    Switch the attention processor to/from the `torch_npu` flash-attention implementation.
    """
    if use_npu_flash_attention:
        self.set_processor(AttnProcessorNPU())
        return

    # Restore the default: AttnProcessor2_0 when torch provides the fused
    # scaled_dot_product_attention and qk scaling is enabled, else the
    # plain AttnProcessor.
    if hasattr(F, "scaled_dot_product_attention") and self.scale_qk:
        self.set_processor(AttnProcessor2_0())
    else:
        self.set_processor(AttnProcessor())
def set_use_memory_efficient_attention_xformers(
    self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
) -> None:
    r"""
    Set whether to use memory efficient attention from `xformers` or not.

    Args:
        use_memory_efficient_attention_xformers (`bool`):
            Whether to use memory efficient attention from `xformers` or not.
        attention_op (`Callable`, *optional*):
            The attention operation to use. Defaults to `None` which uses the default attention operation from
            `xformers`.

    Raises:
        NotImplementedError: when xformers is requested for a Custom Diffusion added-KV processor.
        ModuleNotFoundError: when xformers is requested but not installed.
        ValueError: when xformers is requested but CUDA is unavailable.
    """
    # The replacement processor depends on what is currently installed:
    # Custom Diffusion processors carry trained weights that must be copied
    # over, and added-KV processors need their xformers counterpart.
    is_custom_diffusion = hasattr(self, "processor") and isinstance(
        self.processor,
        (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0),
    )
    is_added_kv_processor = hasattr(self, "processor") and isinstance(
        self.processor,
        (
            AttnAddedKVProcessor,
            AttnAddedKVProcessor2_0,
            SlicedAttnAddedKVProcessor,
            XFormersAttnAddedKVProcessor,
        ),
    )
    if use_memory_efficient_attention_xformers:
        if is_added_kv_processor and is_custom_diffusion:
            raise NotImplementedError(
                f"Memory efficient attention is currently not supported for custom diffusion for attention processor type {self.processor}"
            )
        if not is_xformers_available():
            raise ModuleNotFoundError(
                (
                    "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
                    " xformers"
                ),
                name="xformers",
            )
        elif not torch.cuda.is_available():
            raise ValueError(
                "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
                " only available for GPU "
            )
        else:
            try:
                # Make sure we can run the memory efficient attention
                _ = xformers.ops.memory_efficient_attention(
                    torch.randn((1, 2, 40), device="cuda"),
                    torch.randn((1, 2, 40), device="cuda"),
                    torch.randn((1, 2, 40), device="cuda"),
                )
            except Exception as e:
                raise e
        if is_custom_diffusion:
            # Swap in the xformers Custom Diffusion processor, carrying over
            # the trained weights (and their device).
            processor = CustomDiffusionXFormersAttnProcessor(
                train_kv=self.processor.train_kv,
                train_q_out=self.processor.train_q_out,
                hidden_size=self.processor.hidden_size,
                cross_attention_dim=self.processor.cross_attention_dim,
                attention_op=attention_op,
            )
            processor.load_state_dict(self.processor.state_dict())
            if hasattr(self.processor, "to_k_custom_diffusion"):
                processor.to(self.processor.to_k_custom_diffusion.weight.device)
        elif is_added_kv_processor:
            # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
            # which uses this type of cross attention ONLY because the attention mask of format
            # [0, ..., -10.000, ..., 0, ...,] is not supported
            # throw warning
            logger.info(
                "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
            )
            processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
        else:
            processor = XFormersAttnProcessor(attention_op=attention_op)
    else:
        if is_custom_diffusion:
            # Returning to a non-xformers Custom Diffusion processor; again
            # copy the trained weights across.
            attn_processor_class = (
                CustomDiffusionAttnProcessor2_0
                if hasattr(F, "scaled_dot_product_attention")
                else CustomDiffusionAttnProcessor
            )
            processor = attn_processor_class(
                train_kv=self.processor.train_kv,
                train_q_out=self.processor.train_q_out,
                hidden_size=self.processor.hidden_size,
                cross_attention_dim=self.processor.cross_attention_dim,
            )
            processor.load_state_dict(self.processor.state_dict())
            if hasattr(self.processor, "to_k_custom_diffusion"):
                processor.to(self.processor.to_k_custom_diffusion.weight.device)
        else:
            # set attention processor
            # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
            # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
            # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
            processor = (
                AttnProcessor2_0()
                if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
                else AttnProcessor()
            )
    self.set_processor(processor)
def set_attention_slice(self, slice_size: int) -> None:
    r"""
    Configure sliced attention, which computes attention in chunks to save memory.

    Args:
        slice_size (`int`):
            The slice size for attention computation. Must not exceed
            `self.sliceable_head_dim`.

    Raises:
        ValueError: if `slice_size` is larger than `self.sliceable_head_dim`.
    """
    if slice_size is not None and slice_size > self.sliceable_head_dim:
        raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")

    has_added_kv = self.added_kv_proj_dim is not None
    if slice_size is not None:
        # Sliced variants; pick the added-KV flavor when extra projections exist.
        processor = SlicedAttnAddedKVProcessor(slice_size) if has_added_kv else SlicedAttnProcessor(slice_size)
    elif has_added_kv:
        processor = AttnAddedKVProcessor()
    else:
        # Default processor: AttnProcessor2_0 relies on
        # torch.nn.functional.scaled_dot_product_attention (native
        # Flash/memory-efficient attention) but only with the default `scale`.
        # TODO remove scale_qk check when we move to torch 2.1
        sdpa_available = hasattr(F, "scaled_dot_product_attention")
        processor = AttnProcessor2_0() if sdpa_available and self.scale_qk else AttnProcessor()

    self.set_processor(processor)
def set_processor(self, processor: "AttnProcessor") -> None:
    r"""
    Install `processor` as the attention processor for this layer.

    Args:
        processor (`AttnProcessor`):
            The attention processor to use.
    """
    # An nn.Module processor gets registered in `self._modules`. When a
    # module-based processor is replaced by a plain callable, it must be
    # unregistered so its (possibly trained) parameters do not linger in
    # the module tree / state dict.
    current_is_module = hasattr(self, "processor") and isinstance(self.processor, torch.nn.Module)
    if current_is_module and not isinstance(processor, torch.nn.Module):
        logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
        self._modules.pop("processor")

    self.processor = processor
def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
    r"""
    Get the attention processor in use.

    Args:
        return_deprecated_lora (`bool`, *optional*, defaults to `False`):
            Kept for backward compatibility with callers of the upstream
            diffusers API. The deprecated LoRA processors were removed from
            this vendored module, so the flag no longer selects a different
            object.

    Returns:
        "AttentionProcessor": The attention processor in use.
    """
    # Bug fix: the original body only returned when `return_deprecated_lora`
    # was False and fell through to an implicit `return None` otherwise
    # (the deprecated-LoRA branch was stripped when this file was vendored).
    # Always return the active processor.
    return self.processor
def forward(
    self,
    hidden_states: torch.Tensor,
    encoder_hidden_states: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    **cross_attention_kwargs,
) -> torch.Tensor:
    r"""
    The forward method of the `Attention` class.

    Args:
        hidden_states (`torch.Tensor`):
            The hidden states of the query.
        encoder_hidden_states (`torch.Tensor`, *optional*):
            The hidden states of the encoder.
        attention_mask (`torch.Tensor`, *optional*):
            The attention mask to use. If `None`, no mask is applied.
        **cross_attention_kwargs:
            Additional keyword arguments to pass along to the cross attention.

    Returns:
        `torch.Tensor`: The output of the attention layer.
    """
    # The `Attention` class can call different attention processors / attention functions
    # here we simply pass along all tensors to the selected processor class
    # For standard processors that are defined here, `**cross_attention_kwargs` is empty

    # Inspect the processor's call signature so that kwargs it does not
    # accept are dropped (with a warning) instead of raising a TypeError.
    attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
    # Kwargs in this set are filtered silently, without a warning.
    quiet_attn_parameters = {"ip_adapter_masks"}
    unused_kwargs = [
        k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters
    ]
    if len(unused_kwargs) > 0:
        logger.warning(
            f"cross_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
        )
    # Forward only the kwargs the processor actually declares.
    cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters}
    return self.processor(
        self,
        hidden_states,
        encoder_hidden_states=encoder_hidden_states,
        attention_mask=attention_mask,
        **cross_attention_kwargs,
    )
def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
    r"""
    Fold the head dimension back into the feature dimension.

    Reshapes from `[batch_size * heads, seq_len, dim]` to
    `[batch_size, seq_len, dim * heads]`, where `heads` is the number of
    heads configured on this `Attention` instance.

    Args:
        tensor (`torch.Tensor`): The tensor to reshape.

    Returns:
        `torch.Tensor`: The reshaped tensor.
    """
    heads = self.heads
    flat_batch, seq_len, dim = tensor.shape
    batch = flat_batch // heads
    # (batch*heads, seq, dim) -> (batch, heads, seq, dim) -> (batch, seq, heads, dim) -> merge last two
    unflattened = tensor.reshape(batch, heads, seq_len, dim)
    return unflattened.permute(0, 2, 1, 3).reshape(batch, seq_len, dim * heads)
def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
    r"""
    Split the feature dimension into heads.

    Reshapes from `[batch_size, seq_len, dim]` to
    `[batch_size, heads, seq_len, dim // heads]`, and when `out_dim == 3`
    further flattens to `[batch_size * heads, seq_len, dim // heads]`.

    Args:
        tensor (`torch.Tensor`): The tensor to reshape (3D or 4D).
        out_dim (`int`, *optional*, defaults to `3`): The output rank.

    Returns:
        `torch.Tensor`: The reshaped tensor.
    """
    heads = self.heads
    if tensor.ndim == 3:
        batch_size, seq_len, dim = tensor.shape
        extra_dim = 1
    else:
        # 4D input carries an extra dimension that is folded into the sequence.
        batch_size, extra_dim, seq_len, dim = tensor.shape

    split = tensor.reshape(batch_size, seq_len * extra_dim, heads, dim // heads)
    split = split.permute(0, 2, 1, 3)
    if out_dim == 3:
        split = split.reshape(batch_size * heads, seq_len * extra_dim, dim // heads)
    return split
def get_attention_scores(
    self, query: torch.Tensor, key: torch.Tensor, attention_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
    r"""
    Compute softmax attention probabilities for `query` against `key`.

    Args:
        query (`torch.Tensor`): The query tensor.
        key (`torch.Tensor`): The key tensor.
        attention_mask (`torch.Tensor`, *optional*): Additive attention mask.
            If `None`, no mask is applied.

    Returns:
        `torch.Tensor`: The attention probabilities/scores, cast back to the
        query's original dtype.
    """
    dtype = query.dtype
    if self.upcast_attention:
        # Compute the matmul in fp32 for numerical stability.
        query = query.float()
        key = key.float()

    if attention_mask is None:
        # baddbmm requires an input tensor even when beta == 0; its values
        # are never read in that case.
        bias = torch.empty(
            query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
        )
        beta = 0
    else:
        bias = attention_mask
        beta = 1

    # scores = beta * bias + scale * (query @ key^T), fused in one kernel.
    scores = torch.baddbmm(bias, query, key.transpose(-1, -2), beta=beta, alpha=self.scale)
    del bias

    if self.upcast_softmax:
        scores = scores.float()
    probs = scores.softmax(dim=-1)
    del scores

    return probs.to(dtype)
def prepare_attention_mask(
    self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
) -> torch.Tensor:
    r"""
    Pad and broadcast an attention mask for the attention computation.

    Args:
        attention_mask (`torch.Tensor`):
            The attention mask to prepare, or `None`.
        target_length (`int`):
            The target length of the attention mask.
        batch_size (`int`):
            The batch size, used to repeat the attention mask across heads.
        out_dim (`int`, *optional*, defaults to `3`):
            The output rank of the attention mask: `3` or `4`.

    Returns:
        `torch.Tensor`: The prepared attention mask (or `None` if none given).
    """
    if attention_mask is None:
        return attention_mask

    head_size = self.heads
    current_length: int = attention_mask.shape[-1]
    if current_length != target_length:
        if attention_mask.device.type == "mps":
            # HACK: MPS cannot pad beyond the input tensor's own size, so
            # build the padding tensor explicitly and concatenate.
            pad_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
            pad = torch.zeros(pad_shape, dtype=attention_mask.dtype, device=attention_mask.device)
            attention_mask = torch.cat([attention_mask, pad], dim=2)
        else:
            # NOTE: this pads by `target_length` rather than the remaining
            # length (`target_length - current_length`); kept as-is for
            # backward compatibility (see the upstream diffusers TODO about
            # re-enabling test_model_xattn_padding).
            attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)

    if out_dim == 3:
        # Repeat across heads along the batch dimension when needed.
        if attention_mask.shape[0] < batch_size * head_size:
            attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
    elif out_dim == 4:
        # Insert an explicit head dimension instead.
        attention_mask = attention_mask.unsqueeze(1)
        attention_mask = attention_mask.repeat_interleave(head_size, dim=1)

    return attention_mask
def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
    r"""
    Normalize the encoder hidden states with `self.norm_cross`, which must be
    either an `nn.LayerNorm` or an `nn.GroupNorm`.

    Args:
        encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.

    Returns:
        `torch.Tensor`: The normalized encoder hidden states.
    """
    assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"

    if isinstance(self.norm_cross, nn.LayerNorm):
        return self.norm_cross(encoder_hidden_states)

    if isinstance(self.norm_cross, nn.GroupNorm):
        # GroupNorm normalizes the channel dimension and expects (N, C, *),
        # so swap (batch, seq, hidden) -> (batch, hidden, seq), normalize,
        # then swap back.
        swapped = encoder_hidden_states.transpose(1, 2)
        return self.norm_cross(swapped).transpose(1, 2)

    # Any other norm type is unsupported.
    assert False
@torch.no_grad()
def fuse_projections(self, fuse=True):
    """
    Fuse the separate Q/K/V projection layers into single linear layers.

    For self-attention, `to_q`/`to_k`/`to_v` weights are concatenated into a
    new `to_qkv` layer; for cross-attention only `to_k`/`to_v` are fused into
    `to_kv`. Added projections (`add_q_proj` etc., used by SD3-style joint
    attention) are fused into `to_added_qkv` when present.

    Args:
        fuse (`bool`, defaults to `True`): value stored in
            `self.fused_projections`. NOTE(review): the fused layers are
            created regardless of this flag and fusion is not undone when
            `fuse=False` — confirm intended.
    """
    device = self.to_q.weight.data.device
    dtype = self.to_q.weight.data.dtype
    if not self.is_cross_attention:
        # fetch weight matrices.
        concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
        in_features = concatenated_weights.shape[1]
        out_features = concatenated_weights.shape[0]
        # create a new single projection layer and copy over the weights.
        self.to_qkv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
        self.to_qkv.weight.copy_(concatenated_weights)
        if self.use_bias:
            concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
            self.to_qkv.bias.copy_(concatenated_bias)
    else:
        # Cross-attention: only key/value consume the encoder input, so only
        # those two projections can be fused.
        concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
        in_features = concatenated_weights.shape[1]
        out_features = concatenated_weights.shape[0]
        self.to_kv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
        self.to_kv.weight.copy_(concatenated_weights)
        if self.use_bias:
            concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
            self.to_kv.bias.copy_(concatenated_bias)
    # handle added projections for SD3 and others.
    if hasattr(self, "add_q_proj") and hasattr(self, "add_k_proj") and hasattr(self, "add_v_proj"):
        concatenated_weights = torch.cat(
            [self.add_q_proj.weight.data, self.add_k_proj.weight.data, self.add_v_proj.weight.data]
        )
        in_features = concatenated_weights.shape[1]
        out_features = concatenated_weights.shape[0]
        self.to_added_qkv = nn.Linear(
            in_features, out_features, bias=self.added_proj_bias, device=device, dtype=dtype
        )
        self.to_added_qkv.weight.copy_(concatenated_weights)
        if self.added_proj_bias:
            concatenated_bias = torch.cat(
                [self.add_q_proj.bias.data, self.add_k_proj.bias.data, self.add_v_proj.bias.data]
            )
            self.to_added_qkv.bias.copy_(concatenated_bias)
    self.fused_projections = fuse
class AttnProcessor:
    r"""
    Default processor for performing attention-related computations.
    """

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        temb: Optional[torch.Tensor] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        # A `scale` argument used to be accepted; warn and ignore it.
        if len(args) > 0 or kwargs.get("scale", None) is not None:
            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
            deprecate("scale", "1.0.0", deprecation_message)
        residual = hidden_states
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)
        # 4D (batch, channel, height, width) inputs are flattened to
        # (batch, height*width, channel) for attention, restored at the end.
        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
        # sequence_length comes from the key/value source so the mask is
        # prepared for the cross-attention length when encoder states exist.
        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
        query = attn.to_q(hidden_states)
        # Self-attention when no encoder states are given.
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)
        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)
        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)
        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)
        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
        if attn.residual_connection:
            hidden_states = hidden_states + residual
        hidden_states = hidden_states / attn.rescale_output_factor
        return hidden_states
class CustomDiffusionAttnProcessor(nn.Module):
    r"""
    Processor for implementing attention for the Custom Diffusion method.

    Args:
        train_kv (`bool`, defaults to `True`):
            Whether to newly train the key and value matrices corresponding to the text features.
        train_q_out (`bool`, defaults to `True`):
            Whether to newly train query matrices corresponding to the latent image features.
        hidden_size (`int`, *optional*, defaults to `None`):
            The hidden size of the attention layer.
        cross_attention_dim (`int`, *optional*, defaults to `None`):
            The number of channels in the `encoder_hidden_states`.
        out_bias (`bool`, defaults to `True`):
            Whether to include the bias parameter in `train_q_out`.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability to use.
    """

    def __init__(
        self,
        train_kv: bool = True,
        train_q_out: bool = True,
        hidden_size: Optional[int] = None,
        cross_attention_dim: Optional[int] = None,
        out_bias: bool = True,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.train_kv = train_kv
        self.train_q_out = train_q_out
        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        # `_custom_diffusion` id for easy serialization and loading.
        if self.train_kv:
            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
        if self.train_q_out:
            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
            self.to_out_custom_diffusion = nn.ModuleList([])
            self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
            self.to_out_custom_diffusion.append(nn.Dropout(dropout))

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run attention, preferring the newly trained projections when enabled."""
        batch_size, sequence_length, _ = hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
        # Query: trainable projection or the frozen base-model one.
        if self.train_q_out:
            query = self.to_q_custom_diffusion(hidden_states).to(attn.to_q.weight.dtype)
        else:
            query = attn.to_q(hidden_states.to(attn.to_q.weight.dtype))
        if encoder_hidden_states is None:
            crossattn = False
            encoder_hidden_states = hidden_states
        else:
            crossattn = True
            if attn.norm_cross:
                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
        if self.train_kv:
            # Cast inputs to the trainable layers' dtype, then back to the
            # base model's query dtype for the attention matmul.
            key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype))
            value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype))
            key = key.to(attn.to_q.weight.dtype)
            value = value.to(attn.to_q.weight.dtype)
        else:
            key = attn.to_k(encoder_hidden_states)
            value = attn.to_v(encoder_hidden_states)
        if crossattn:
            # Detach key/value at the first token position only, so gradients
            # do not flow through it during cross-attention (remaining
            # positions keep their gradient path).
            detach = torch.ones_like(key)
            detach[:, :1, :] = detach[:, :1, :] * 0.0
            key = detach * key + (1 - detach) * key.detach()
            value = detach * value + (1 - detach) * value.detach()
        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)
        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)
        if self.train_q_out:
            # linear proj
            hidden_states = self.to_out_custom_diffusion[0](hidden_states)
            # dropout
            hidden_states = self.to_out_custom_diffusion[1](hidden_states)
        else:
            # linear proj
            hidden_states = attn.to_out[0](hidden_states)
            # dropout
            hidden_states = attn.to_out[1](hidden_states)
        return hidden_states
class AttnAddedKVProcessor:
    r"""
    Processor for performing attention-related computations with extra learnable key and value matrices for the text
    encoder.
    """

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        # A `scale` argument used to be accepted; warn and ignore it.
        if len(args) > 0 or kwargs.get("scale", None) is not None:
            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
            deprecate("scale", "1.0.0", deprecation_message)
        residual = hidden_states
        # Flatten trailing spatial dims: (batch, channel, *) -> (batch, seq, channel).
        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
        batch_size, sequence_length, _ = hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
        query = attn.to_q(hidden_states)
        query = attn.head_to_batch_dim(query)
        # Extra learnable key/value projections applied to the encoder states.
        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
        if not attn.only_cross_attention:
            # Prepend the projected encoder keys/values to the self-attention
            # keys/values along the sequence dimension.
            key = attn.to_k(hidden_states)
            value = attn.to_v(hidden_states)
            key = attn.head_to_batch_dim(key)
            value = attn.head_to_batch_dim(value)
            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
        else:
            key = encoder_hidden_states_key_proj
            value = encoder_hidden_states_value_proj
        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)
        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)
        # Restore the original (batch, channel, *spatial) layout and add residual.
        hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
        hidden_states = hidden_states + residual
        return hidden_states
class AttnAddedKVProcessor2_0:
r"""
Processor for performing scaled dot-product attention (enabled by default if you're using PyTorch 2.0), with extra
learnable key and value matrices for the text encoder.
"""
def __init__(self):
if not hasattr(F, "scaled_dot_product_attention"):
raise ImportError(
"AttnAddedKVProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
)
def __call__(
self,
attn: Attention,
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
*args,
**kwargs,
) -> torch.Tensor:
if len(args) > 0 or kwargs.get("scale", None) is not None:
deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
deprecate("scale", "1.0.0", deprecation_message)
residual = hidden_states
hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
batch_size, sequence_length, _ = hidden_states.shape
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=4)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
hidden_states
gitextract_mycvvgja/
├── CogVideo/
│ ├── .github/
│ │ ├── ISSUE_TEMPLATE/
│ │ │ ├── bug_report.yaml
│ │ │ └── feature-request.yaml
│ │ └── PULL_REQUEST_TEMPLATE/
│ │ └── pr_template.md
│ ├── .gitignore
│ ├── LICENSE
│ ├── MODEL_LICENSE
│ ├── README.md
│ ├── README_ja.md
│ ├── README_zh.md
│ ├── download.sh
│ ├── finetune/
│ │ ├── README.md
│ │ ├── README_ja.md
│ │ ├── README_zh.md
│ │ ├── accelerate_config_machine_single.yaml
│ │ ├── accelerate_config_machine_single_debug.yaml
│ │ ├── finetune_single_rank_injector.sh
│ │ ├── finetune_single_rank_lora.sh
│ │ ├── hostfile.txt
│ │ ├── models/
│ │ │ ├── attention.py
│ │ │ ├── attention_processor.py
│ │ │ ├── cogvideox_transformer_3d.py
│ │ │ ├── embeddings.py
│ │ │ ├── pipeline_cogvideox.py
│ │ │ ├── pipeline_output.py
│ │ │ └── utils.py
│ │ ├── train_cogvideox_injector.py
│ │ └── train_cogvideox_lora.py
│ ├── inference/
│ │ ├── 3dtrajmaster_inference.py
│ │ ├── entity_zoo.txt
│ │ └── location_zoo.txt
│ ├── pyproject.toml
│ ├── requirements.txt
│ ├── tools/
│ │ ├── caption/
│ │ │ ├── README.md
│ │ │ ├── README_ja.md
│ │ │ ├── README_zh.md
│ │ │ ├── requirements.txt
│ │ │ └── video_caption.py
│ │ ├── convert_weight_sat2hf.py
│ │ ├── export_sat_lora_weight.py
│ │ ├── llm_flux_cogvideox/
│ │ │ ├── generate.sh
│ │ │ ├── gradio_page.py
│ │ │ └── llm_flux_cogvideox.py
│ │ ├── load_cogvideox_lora.py
│ │ ├── parallel_inference/
│ │ │ ├── parallel_inference_xdit.py
│ │ │ └── run.sh
│ │ ├── replicate/
│ │ │ ├── cog.yaml
│ │ │ ├── predict_i2v.py
│ │ │ └── predict_t2v.py
│ │ └── venhancer/
│ │ ├── README.md
│ │ ├── README_ja.md
│ │ └── README_zh.md
│ └── weights/
│ └── put weights here.txt
├── README.md
├── dataset/
│ ├── load_dataset.py
│ ├── traj_vis/
│ │ ├── D_loc1_61_t3n13_003d_Hemi12_1.json
│ │ ├── Hemi12_transforms.json
│ │ └── location_data_desert.json
│ ├── utils.py
│ └── vis_trajectory.py
└── eval/
├── GVHMR/
│ ├── .gitignore
│ ├── .gitmodules
│ ├── LICENSE
│ ├── README.md
│ ├── docs/
│ │ └── INSTALL.md
│ ├── download_eval_pose.sh
│ ├── eval.sh
│ ├── hmr4d/
│ │ ├── __init__.py
│ │ ├── build_gvhmr.py
│ │ ├── configs/
│ │ │ ├── __init__.py
│ │ │ ├── data/
│ │ │ │ └── mocap/
│ │ │ │ ├── testY.yaml
│ │ │ │ └── trainX_testY.yaml
│ │ │ ├── demo.yaml
│ │ │ ├── exp/
│ │ │ │ └── gvhmr/
│ │ │ │ └── mixed/
│ │ │ │ └── mixed.yaml
│ │ │ ├── global/
│ │ │ │ ├── debug/
│ │ │ │ │ ├── debug_train.yaml
│ │ │ │ │ └── debug_train_limit_data.yaml
│ │ │ │ └── task/
│ │ │ │ └── gvhmr/
│ │ │ │ ├── test_3dpw.yaml
│ │ │ │ ├── test_3dpw_emdb_rich.yaml
│ │ │ │ ├── test_emdb.yaml
│ │ │ │ └── test_rich.yaml
│ │ │ ├── hydra/
│ │ │ │ └── default.yaml
│ │ │ ├── siga24_release.yaml
│ │ │ ├── store_gvhmr.py
│ │ │ └── train.yaml
│ │ ├── datamodule/
│ │ │ └── mocap_trainX_testY.py
│ │ ├── dataset/
│ │ │ ├── bedlam/
│ │ │ │ ├── bedlam.py
│ │ │ │ ├── resource/
│ │ │ │ │ └── vname2lwh.pt
│ │ │ │ └── utils.py
│ │ │ ├── emdb/
│ │ │ │ ├── emdb_motion_test.py
│ │ │ │ └── utils.py
│ │ │ ├── h36m/
│ │ │ │ ├── camera-parameters.json
│ │ │ │ ├── h36m.py
│ │ │ │ └── utils.py
│ │ │ ├── imgfeat_motion/
│ │ │ │ └── base_dataset.py
│ │ │ ├── pure_motion/
│ │ │ │ ├── amass.py
│ │ │ │ ├── base_dataset.py
│ │ │ │ ├── cam_traj_utils.py
│ │ │ │ └── utils.py
│ │ │ ├── rich/
│ │ │ │ ├── resource/
│ │ │ │ │ ├── cam2params.pt
│ │ │ │ │ ├── seqname2imgrange.json
│ │ │ │ │ ├── test.txt
│ │ │ │ │ ├── train.txt
│ │ │ │ │ ├── val.txt
│ │ │ │ │ └── w2az_sahmr.json
│ │ │ │ ├── rich_motion_test.py
│ │ │ │ └── rich_utils.py
│ │ │ └── threedpw/
│ │ │ ├── threedpw_motion_test.py
│ │ │ ├── threedpw_motion_train.py
│ │ │ └── utils.py
│ │ ├── model/
│ │ │ ├── common_utils/
│ │ │ │ ├── optimizer.py
│ │ │ │ ├── scheduler.py
│ │ │ │ └── scheduler_cfg.py
│ │ │ └── gvhmr/
│ │ │ ├── callbacks/
│ │ │ │ ├── metric_3dpw.py
│ │ │ │ ├── metric_emdb.py
│ │ │ │ └── metric_rich.py
│ │ │ ├── gvhmr_pl.py
│ │ │ ├── gvhmr_pl_demo.py
│ │ │ ├── pipeline/
│ │ │ │ └── gvhmr_pipeline.py
│ │ │ └── utils/
│ │ │ ├── endecoder.py
│ │ │ ├── postprocess.py
│ │ │ └── stats_compose.py
│ │ ├── network/
│ │ │ ├── base_arch/
│ │ │ │ ├── embeddings/
│ │ │ │ │ └── rotary_embedding.py
│ │ │ │ └── transformer/
│ │ │ │ ├── encoder_rope.py
│ │ │ │ └── layer.py
│ │ │ ├── gvhmr/
│ │ │ │ └── relative_transformer.py
│ │ │ └── hmr2/
│ │ │ ├── __init__.py
│ │ │ ├── components/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pose_transformer.py
│ │ │ │ └── t_cond_mlp.py
│ │ │ ├── configs/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── model_config.yaml
│ │ │ │ └── smpl_mean_params.npz
│ │ │ ├── hmr2.py
│ │ │ ├── smpl_head.py
│ │ │ ├── utils/
│ │ │ │ ├── geometry.py
│ │ │ │ ├── preproc.py
│ │ │ │ └── smpl_wrapper.py
│ │ │ └── vit.py
│ │ └── utils/
│ │ ├── body_model/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── body_model.py
│ │ │ ├── body_model_smplh.py
│ │ │ ├── body_model_smplx.py
│ │ │ ├── coco_aug_dict.pth
│ │ │ ├── min_lbs.py
│ │ │ ├── seg_part_info.npy
│ │ │ ├── smpl_3dpw14_J_regressor_sparse.pt
│ │ │ ├── smpl_coco17_J_regressor.pt
│ │ │ ├── smpl_lite.py
│ │ │ ├── smpl_neutral_J_regressor.pt
│ │ │ ├── smpl_vert_segmentation.json
│ │ │ ├── smplx2smpl_sparse.pt
│ │ │ ├── smplx_lite.py
│ │ │ ├── smplx_verts437.pt
│ │ │ └── utils.py
│ │ ├── callbacks/
│ │ │ ├── lr_monitor.py
│ │ │ ├── prog_bar.py
│ │ │ ├── simple_ckpt_saver.py
│ │ │ └── train_speed_timer.py
│ │ ├── comm/
│ │ │ └── gather.py
│ │ ├── eval/
│ │ │ └── eval_utils.py
│ │ ├── geo/
│ │ │ ├── augment_noisy_pose.py
│ │ │ ├── flip_utils.py
│ │ │ ├── hmr_cam.py
│ │ │ ├── hmr_global.py
│ │ │ ├── quaternion.py
│ │ │ └── transforms.py
│ │ ├── geo_transform.py
│ │ ├── ik/
│ │ │ └── ccd_ik.py
│ │ ├── kpts/
│ │ │ └── kp2d_utils.py
│ │ ├── matrix.py
│ │ ├── net_utils.py
│ │ ├── preproc/
│ │ │ ├── __init__.py
│ │ │ ├── slam.py
│ │ │ ├── tracker.py
│ │ │ ├── vitfeat_extractor.py
│ │ │ ├── vitpose.py
│ │ │ └── vitpose_pytorch/
│ │ │ ├── __init__.py
│ │ │ └── src/
│ │ │ └── vitpose_infer/
│ │ │ ├── __init__.py
│ │ │ ├── builder/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbones/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── alexnet.py
│ │ │ │ │ ├── cpm.py
│ │ │ │ │ ├── hourglass.py
│ │ │ │ │ ├── hourglass_ae.py
│ │ │ │ │ ├── hrformer.py
│ │ │ │ │ ├── litehrnet.py
│ │ │ │ │ ├── mobilenet_v2.py
│ │ │ │ │ ├── mobilenet_v3.py
│ │ │ │ │ ├── mspn.py
│ │ │ │ │ ├── regnet.py
│ │ │ │ │ ├── resnest.py
│ │ │ │ │ ├── resnext.py
│ │ │ │ │ ├── rsn.py
│ │ │ │ │ ├── scnet.py
│ │ │ │ │ ├── seresnet.py
│ │ │ │ │ ├── seresnext.py
│ │ │ │ │ ├── shufflenet_v1.py
│ │ │ │ │ ├── shufflenet_v2.py
│ │ │ │ │ ├── tcn.py
│ │ │ │ │ ├── test_torch.py
│ │ │ │ │ ├── utils/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── channel_shuffle.py
│ │ │ │ │ │ ├── inverted_residual.py
│ │ │ │ │ │ ├── make_divisible.py
│ │ │ │ │ │ ├── se_layer.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── vgg.py
│ │ │ │ │ ├── vipnas_mbv3.py
│ │ │ │ │ ├── vipnas_resnet.py
│ │ │ │ │ └── vit.py
│ │ │ │ ├── configs/
│ │ │ │ │ └── coco/
│ │ │ │ │ ├── ViTPose_base_coco_256x192.py
│ │ │ │ │ ├── ViTPose_base_simple_coco_256x192.py
│ │ │ │ │ ├── ViTPose_huge_coco_256x192.py
│ │ │ │ │ ├── ViTPose_huge_simple_coco_256x192.py
│ │ │ │ │ ├── ViTPose_large_coco_256x192.py
│ │ │ │ │ ├── ViTPose_large_simple_coco_256x192.py
│ │ │ │ │ └── __init__.py
│ │ │ │ ├── heads/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── deconv_head.py
│ │ │ │ │ ├── deeppose_regression_head.py
│ │ │ │ │ ├── hmr_head.py
│ │ │ │ │ ├── interhand_3d_head.py
│ │ │ │ │ ├── temporal_regression_head.py
│ │ │ │ │ ├── topdown_heatmap_base_head.py
│ │ │ │ │ ├── topdown_heatmap_multi_stage_head.py
│ │ │ │ │ ├── topdown_heatmap_simple_head.py
│ │ │ │ │ ├── vipnas_heatmap_simple_head.py
│ │ │ │ │ └── voxelpose_head.py
│ │ │ │ └── model_builder.py
│ │ │ ├── model_builder.py
│ │ │ └── pose_utils/
│ │ │ ├── ViTPose_trt.py
│ │ │ ├── __init__.py
│ │ │ ├── convert_to_trt.py
│ │ │ ├── general_utils.py
│ │ │ ├── inference_test.py
│ │ │ ├── logger_helper.py
│ │ │ ├── pose_utils.py
│ │ │ ├── pose_viz.py
│ │ │ ├── timerr.py
│ │ │ └── visualizer.py
│ │ ├── pylogger.py
│ │ ├── seq_utils.py
│ │ ├── smplx_utils.py
│ │ ├── video_io_utils.py
│ │ ├── vis/
│ │ │ ├── README.md
│ │ │ ├── cv2_utils.py
│ │ │ ├── renderer.py
│ │ │ ├── renderer_tools.py
│ │ │ ├── renderer_utils.py
│ │ │ └── rich_logger.py
│ │ └── wis3d_utils.py
│ ├── pyproject.toml
│ ├── pyrightconfig.json
│ ├── requirements.txt
│ ├── setup.py
│ └── tools/
│ ├── demo/
│ │ ├── colab_demo.ipynb
│ │ ├── demo.py
│ │ └── demo_folder.py
│ ├── eval_pose.py
│ ├── train.py
│ ├── unitest/
│ │ ├── make_hydra_cfg.py
│ │ └── run_dataset.py
│ └── video/
│ ├── merge_folder.py
│ ├── merge_horizontal.py
│ └── merge_vertical.py
└── common_metrics_on_video_quality/
├── .gitignore
├── README.md
├── calculate_clip.py
├── calculate_fvd.py
├── calculate_fvd_styleganv.py
├── calculate_lpips.py
├── calculate_psnr.py
├── calculate_ssim.py
├── download_eval_visual.sh
├── eval_prompts.json
└── eval_visual.sh
SYMBOL INDEX (1654 symbols across 163 files)
FILE: CogVideo/finetune/models/attention.py
function _chunked_feed_forward (line 29) | def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, ch...
class GatedSelfAttentionDense (line 45) | class GatedSelfAttentionDense(nn.Module):
method __init__ (line 56) | def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_h...
method forward (line 73) | def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
class JointTransformerBlock (line 87) | class JointTransformerBlock(nn.Module):
method __init__ (line 101) | def __init__(
method set_chunk_feed_forward (line 183) | def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int =...
method forward (line 188) | def forward(
class BasicTransformerBlock (line 252) | class BasicTransformerBlock(nn.Module):
method __init__ (line 287) | def __init__(
method set_chunk_feed_forward (line 455) | def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int =...
method forward (line 460) | def forward(
class LuminaFeedForward (line 583) | class LuminaFeedForward(nn.Module):
method __init__ (line 598) | def __init__(
method forward (line 629) | def forward(self, x):
class TemporalBasicTransformerBlock (line 634) | class TemporalBasicTransformerBlock(nn.Module):
method __init__ (line 646) | def __init__(
method set_chunk_feed_forward (line 699) | def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
method forward (line 705) | def forward(
class SkipFFTransformerBlock (line 763) | class SkipFFTransformerBlock(nn.Module):
method __init__ (line 764) | def __init__(
method forward (line 806) | def forward(self, hidden_states, encoder_hidden_states, cross_attentio...
class FreeNoiseTransformerBlock (line 836) | class FreeNoiseTransformerBlock(nn.Module):
method __init__ (line 891) | def __init__(
method _get_frame_indices (line 1005) | def _get_frame_indices(self, num_frames: int) -> List[Tuple[int, int]]:
method _get_frame_weights (line 1013) | def _get_frame_weights(self, num_frames: int, weighting_scheme: str = ...
method set_free_noise_properties (line 1045) | def set_free_noise_properties(
method set_chunk_feed_forward (line 1052) | def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int =...
method forward (line 1057) | def forward(
class FeedForward (line 1183) | class FeedForward(nn.Module):
method __init__ (line 1197) | def __init__(
method forward (line 1235) | def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> tor...
FILE: CogVideo/finetune/models/attention_processor.py
class Attention (line 41) | class Attention(nn.Module):
method __init__ (line 96) | def __init__(
method set_use_npu_flash_attention (line 271) | def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -...
method set_use_memory_efficient_attention_xformers (line 288) | def set_use_memory_efficient_attention_xformers(
method set_attention_slice (line 395) | def set_attention_slice(self, slice_size: int) -> None:
method set_processor (line 423) | def set_processor(self, processor: "AttnProcessor") -> None:
method get_processor (line 443) | def get_processor(self, return_deprecated_lora: bool = False) -> "Atte...
method forward (line 457) | def forward(
method batch_to_head_dim (line 503) | def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
method head_to_batch_dim (line 520) | def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) ->...
method get_attention_scores (line 547) | def get_attention_scores(
method prepare_attention_mask (line 594) | def prepare_attention_mask(
method norm_encoder_hidden_states (line 641) | def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tens...
method fuse_projections (line 671) | def fuse_projections(self, fuse=True):
class AttnProcessor (line 720) | class AttnProcessor:
method __call__ (line 725) | def __call__(
class CustomDiffusionAttnProcessor (line 792) | class CustomDiffusionAttnProcessor(nn.Module):
method __init__ (line 811) | def __init__(
method __call__ (line 837) | def __call__(
class AttnAddedKVProcessor (line 896) | class AttnAddedKVProcessor:
method __call__ (line 902) | def __call__(
class AttnAddedKVProcessor2_0 (line 963) | class AttnAddedKVProcessor2_0:
method __init__ (line 969) | def __init__(self):
method __call__ (line 975) | def __call__(
class JointAttnProcessor2_0 (line 1039) | class JointAttnProcessor2_0:
method __init__ (line 1042) | def __init__(self):
method __call__ (line 1046) | def __call__(
class PAGJointAttnProcessor2_0 (line 1125) | class PAGJointAttnProcessor2_0:
method __init__ (line 1128) | def __init__(self):
method __call__ (line 1134) | def __call__(
class PAGCFGJointAttnProcessor2_0 (line 1280) | class PAGCFGJointAttnProcessor2_0:
method __init__ (line 1283) | def __init__(self):
method __call__ (line 1289) | def __call__(
class FusedJointAttnProcessor2_0 (line 1445) | class FusedJointAttnProcessor2_0:
method __init__ (line 1448) | def __init__(self):
method __call__ (line 1452) | def __call__(
class AuraFlowAttnProcessor2_0 (line 1524) | class AuraFlowAttnProcessor2_0:
method __init__ (line 1527) | def __init__(self):
method __call__ (line 1533) | def __call__(
class FusedAuraFlowAttnProcessor2_0 (line 1617) | class FusedAuraFlowAttnProcessor2_0:
method __init__ (line 1620) | def __init__(self):
method __call__ (line 1626) | def __call__(
class FluxAttnProcessor2_0 (line 1714) | class FluxAttnProcessor2_0:
method __init__ (line 1717) | def __init__(self):
method __call__ (line 1721) | def __call__(
class FusedFluxAttnProcessor2_0 (line 1802) | class FusedFluxAttnProcessor2_0:
method __init__ (line 1805) | def __init__(self):
method __call__ (line 1811) | def __call__(
class CogVideoXAttnProcessor2_0 (line 1896) | class CogVideoXAttnProcessor2_0:
method __init__ (line 1902) | def __init__(self):
method __call__ (line 1906) | def __call__(
class FusedCogVideoXAttnProcessor2_0 (line 1985) | class FusedCogVideoXAttnProcessor2_0:
method __init__ (line 1991) | def __init__(self):
method __call__ (line 1995) | def __call__(
class XFormersAttnAddedKVProcessor (line 2056) | class XFormersAttnAddedKVProcessor:
method __init__ (line 2068) | def __init__(self, attention_op: Optional[Callable] = None):
method __call__ (line 2071) | def __call__(
class XFormersAttnProcessor (line 2127) | class XFormersAttnProcessor:
method __init__ (line 2139) | def __init__(self, attention_op: Optional[Callable] = None):
method __call__ (line 2142) | def __call__(
class AttnProcessorNPU (line 2221) | class AttnProcessorNPU:
method __init__ (line 2229) | def __init__(self):
method __call__ (line 2233) | def __call__(
class AttnProcessor2_0 (line 2330) | class AttnProcessor2_0:
method __init__ (line 2335) | def __init__(self):
method __call__ (line 2339) | def __call__(
class StableAudioAttnProcessor2_0 (line 2424) | class StableAudioAttnProcessor2_0:
method __init__ (line 2430) | def __init__(self):
method apply_partial_rotary_emb (line 2436) | def apply_partial_rotary_emb(
method __call__ (line 2451) | def __call__(
class HunyuanAttnProcessor2_0 (line 2555) | class HunyuanAttnProcessor2_0:
method __init__ (line 2561) | def __init__(self):
method __call__ (line 2565) | def __call__(
class FusedHunyuanAttnProcessor2_0 (line 2653) | class FusedHunyuanAttnProcessor2_0:
method __init__ (line 2660) | def __init__(self):
method __call__ (line 2666) | def __call__(
class PAGHunyuanAttnProcessor2_0 (line 2756) | class PAGHunyuanAttnProcessor2_0:
method __init__ (line 2763) | def __init__(self):
method __call__ (line 2769) | def __call__(
class PAGCFGHunyuanAttnProcessor2_0 (line 2879) | class PAGCFGHunyuanAttnProcessor2_0:
method __init__ (line 2886) | def __init__(self):
method __call__ (line 2892) | def __call__(
class LuminaAttnProcessor2_0 (line 3003) | class LuminaAttnProcessor2_0:
method __init__ (line 3009) | def __init__(self):
method __call__ (line 3013) | def __call__(
class FusedAttnProcessor2_0 (line 3099) | class FusedAttnProcessor2_0:
method __init__ (line 3112) | def __init__(self):
method __call__ (line 3118) | def __call__(
class CustomDiffusionXFormersAttnProcessor (line 3205) | class CustomDiffusionXFormersAttnProcessor(nn.Module):
method __init__ (line 3228) | def __init__(
method __call__ (line 3256) | def __call__(
class CustomDiffusionAttnProcessor2_0 (line 3321) | class CustomDiffusionAttnProcessor2_0(nn.Module):
method __init__ (line 3341) | def __init__(
method __call__ (line 3367) | def __call__(
class SlicedAttnProcessor (line 3435) | class SlicedAttnProcessor:
method __init__ (line 3445) | def __init__(self, slice_size: int):
method __call__ (line 3448) | def __call__(
class SlicedAttnAddedKVProcessor (line 3522) | class SlicedAttnAddedKVProcessor:
method __init__ (line 3532) | def __init__(self, slice_size):
method __call__ (line 3535) | def __call__(
class SpatialNorm (line 3614) | class SpatialNorm(nn.Module):
method __init__ (line 3625) | def __init__(
method forward (line 3635) | def forward(self, f: torch.Tensor, zq: torch.Tensor) -> torch.Tensor:
class IPAdapterAttnProcessor (line 3643) | class IPAdapterAttnProcessor(nn.Module):
method __init__ (line 3658) | def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(...
method __call__ (line 3681) | def __call__(
class IPAdapterAttnProcessor2_0 (line 3841) | class IPAdapterAttnProcessor2_0(torch.nn.Module):
method __init__ (line 3856) | def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(...
method __call__ (line 3884) | def __call__(
class PAGIdentitySelfAttnProcessor2_0 (line 4071) | class PAGIdentitySelfAttnProcessor2_0:
method __init__ (line 4077) | def __init__(self):
method __call__ (line 4083) | def __call__(
class PAGCFGIdentitySelfAttnProcessor2_0 (line 4170) | class PAGCFGIdentitySelfAttnProcessor2_0:
method __init__ (line 4176) | def __init__(self):
method __call__ (line 4182) | def __call__(
class LoRAAttnProcessor (line 4273) | class LoRAAttnProcessor:
method __init__ (line 4274) | def __init__(self):
class LoRAAttnProcessor2_0 (line 4278) | class LoRAAttnProcessor2_0:
method __init__ (line 4279) | def __init__(self):
class LoRAXFormersAttnProcessor (line 4283) | class LoRAXFormersAttnProcessor:
method __init__ (line 4284) | def __init__(self):
class LoRAAttnAddedKVProcessor (line 4288) | class LoRAAttnAddedKVProcessor:
method __init__ (line 4289) | def __init__(self):
class FluxSingleAttnProcessor2_0 (line 4293) | class FluxSingleAttnProcessor2_0(FluxAttnProcessor2_0):
method __init__ (line 4298) | def __init__(self):
FILE: CogVideo/finetune/models/cogvideox_transformer_3d.py
class CogVideoXBlock (line 36) | class CogVideoXBlock(nn.Module):
method __init__ (line 71) | def __init__(
method forward (line 134) | def forward(
class CogVideoXTransformer3DModel (line 204) | class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMi...
method __init__ (line 261) | def __init__(
method _set_gradient_checkpointing (line 356) | def _set_gradient_checkpointing(self, module, value=False):
method attn_processors (line 361) | def attn_processors(self) -> Dict[str, AttentionProcessor]:
method set_attn_processor (line 385) | def set_attn_processor(self, processor: Union[AttentionProcessor, Dict...
method fuse_qkv_projections (line 420) | def fuse_qkv_projections(self):
method unfuse_qkv_projections (line 446) | def unfuse_qkv_projections(self):
method forward (line 459) | def forward(
FILE: CogVideo/finetune/models/embeddings.py
function get_timestep_embedding (line 29) | def get_timestep_embedding(
function get_3d_sincos_pos_embed (line 83) | def get_3d_sincos_pos_embed(
function get_2d_sincos_pos_embed (line 130) | def get_2d_sincos_pos_embed(
function get_2d_sincos_pos_embed_from_grid (line 152) | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
function get_1d_sincos_pos_embed_from_grid (line 164) | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
class PatchEmbed (line 185) | class PatchEmbed(nn.Module):
method __init__ (line 188) | def __init__(
method cropped_pos_embed (line 239) | def cropped_pos_embed(self, height, width):
method forward (line 262) | def forward(self, latent):
class LuminaPatchEmbed (line 293) | class LuminaPatchEmbed(nn.Module):
method __init__ (line 296) | def __init__(self, patch_size=2, in_channels=4, embed_dim=768, bias=Tr...
method forward (line 305) | def forward(self, x, freqs_cis):
class CogVideoXPatchEmbed (line 339) | class CogVideoXPatchEmbed(nn.Module):
method __init__ (line 340) | def __init__(
method _get_positional_embeddings (line 381) | def _get_positional_embeddings(self, sample_height: int, sample_width:...
method forward (line 402) | def forward(self, empty_text_embeds: torch.Tensor, text_embeds: torch....
class CogView3PlusPatchEmbed (line 455) | class CogView3PlusPatchEmbed(nn.Module):
method __init__ (line 456) | def __init__(
method forward (line 480) | def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: ...
function get_3d_rotary_pos_embed (line 509) | def get_3d_rotary_pos_embed(
function get_2d_rotary_pos_embed (line 577) | def get_2d_rotary_pos_embed(embed_dim, crops_coords, grid_size, use_real...
function get_2d_rotary_pos_embed_from_grid (line 605) | def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
function get_2d_rotary_pos_embed_lumina (line 625) | def get_2d_rotary_pos_embed_lumina(embed_dim, len_h, len_w, linear_facto...
function get_1d_rotary_pos_embed (line 641) | def get_1d_rotary_pos_embed(
function apply_rotary_emb (line 707) | def apply_rotary_emb(
class FluxPosEmbed (line 756) | class FluxPosEmbed(nn.Module):
method __init__ (line 758) | def __init__(self, theta: int, axes_dim: List[int]):
method forward (line 763) | def forward(self, ids: torch.Tensor) -> torch.Tensor:
class TimestepEmbedding (line 781) | class TimestepEmbedding(nn.Module):
method __init__ (line 782) | def __init__(
method forward (line 814) | def forward(self, sample, condition=None):
class Timesteps (line 829) | class Timesteps(nn.Module):
method __init__ (line 830) | def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale...
method forward (line 837) | def forward(self, timesteps):
class GaussianFourierProjection (line 848) | class GaussianFourierProjection(nn.Module):
method __init__ (line 851) | def __init__(
method forward (line 866) | def forward(self, x):
class SinusoidalPositionalEmbedding (line 879) | class SinusoidalPositionalEmbedding(nn.Module):
method __init__ (line 891) | def __init__(self, embed_dim: int, max_seq_length: int = 32):
method forward (line 900) | def forward(self, x):
class ImagePositionalEmbeddings (line 906) | class ImagePositionalEmbeddings(nn.Module):
method __init__ (line 930) | def __init__(
method forward (line 948) | def forward(self, index):
class LabelEmbedding (line 971) | class LabelEmbedding(nn.Module):
method __init__ (line 981) | def __init__(self, num_classes, hidden_size, dropout_prob):
method token_drop (line 988) | def token_drop(self, labels, force_drop_ids=None):
method forward (line 999) | def forward(self, labels: torch.LongTensor, force_drop_ids=None):
class TextImageProjection (line 1007) | class TextImageProjection(nn.Module):
method __init__ (line 1008) | def __init__(
method forward (line 1021) | def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
class ImageProjection (line 1034) | class ImageProjection(nn.Module):
method __init__ (line 1035) | def __init__(
method forward (line 1047) | def forward(self, image_embeds: torch.Tensor):
class IPAdapterFullImageProjection (line 1057) | class IPAdapterFullImageProjection(nn.Module):
method __init__ (line 1058) | def __init__(self, image_embed_dim=1024, cross_attention_dim=1024):
method forward (line 1065) | def forward(self, image_embeds: torch.Tensor):
class IPAdapterFaceIDImageProjection (line 1069) | class IPAdapterFaceIDImageProjection(nn.Module):
method __init__ (line 1070) | def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mul...
method forward (line 1079) | def forward(self, image_embeds: torch.Tensor):
class CombinedTimestepLabelEmbeddings (line 1085) | class CombinedTimestepLabelEmbeddings(nn.Module):
method __init__ (line 1086) | def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1):
method forward (line 1093) | def forward(self, timestep, class_labels, hidden_dtype=None):
class CombinedTimestepTextProjEmbeddings (line 1104) | class CombinedTimestepTextProjEmbeddings(nn.Module):
method __init__ (line 1105) | def __init__(self, embedding_dim, pooled_projection_dim):
method forward (line 1112) | def forward(self, timestep, pooled_projection):
class CombinedTimestepGuidanceTextProjEmbeddings (line 1123) | class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
method __init__ (line 1124) | def __init__(self, embedding_dim, pooled_projection_dim):
method forward (line 1132) | def forward(self, timestep, guidance, pooled_projection):
class CogView3CombinedTimestepSizeEmbeddings (line 1147) | class CogView3CombinedTimestepSizeEmbeddings(nn.Module):
method __init__ (line 1148) | def __init__(self, embedding_dim: int, condition_dim: int, pooled_proj...
method forward (line 1156) | def forward(
class HunyuanDiTAttentionPool (line 1180) | class HunyuanDiTAttentionPool(nn.Module):
method __init__ (line 1183) | def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, o...
method forward (line 1192) | def forward(self, x):
class HunyuanCombinedTimestepTextSizeStyleEmbedding (line 1220) | class HunyuanCombinedTimestepTextSizeStyleEmbedding(nn.Module):
method __init__ (line 1221) | def __init__(
method forward (line 1255) | def forward(self, timestep, encoder_hidden_states, image_meta_size, st...
class LuminaCombinedTimestepCaptionEmbedding (line 1281) | class LuminaCombinedTimestepCaptionEmbedding(nn.Module):
method __init__ (line 1282) | def __init__(self, hidden_size=4096, cross_attention_dim=2048, frequen...
method forward (line 1299) | def forward(self, timestep, caption_feat, caption_mask):
class TextTimeEmbedding (line 1315) | class TextTimeEmbedding(nn.Module):
method __init__ (line 1316) | def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: i...
method forward (line 1323) | def forward(self, hidden_states):
class TextImageTimeEmbedding (line 1331) | class TextImageTimeEmbedding(nn.Module):
method __init__ (line 1332) | def __init__(self, text_embed_dim: int = 768, image_embed_dim: int = 7...
method forward (line 1338) | def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
class ImageTimeEmbedding (line 1349) | class ImageTimeEmbedding(nn.Module):
method __init__ (line 1350) | def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1...
method forward (line 1355) | def forward(self, image_embeds: torch.Tensor):
class ImageHintTimeEmbedding (line 1362) | class ImageHintTimeEmbedding(nn.Module):
method __init__ (line 1363) | def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1...
method forward (line 1385) | def forward(self, image_embeds: torch.Tensor, hint: torch.Tensor):
class AttentionPooling (line 1393) | class AttentionPooling(nn.Module):
method __init__ (line 1396) | def __init__(self, num_heads, embed_dim, dtype=None):
method forward (line 1406) | def forward(self, x):
function get_fourier_embeds_from_boundingbox (line 1443) | def get_fourier_embeds_from_boundingbox(embed_dim, box):
class GLIGENTextBoundingboxProjection (line 1464) | class GLIGENTextBoundingboxProjection(nn.Module):
method __init__ (line 1465) | def __init__(self, positive_len, out_dim, feature_type="text-only", fo...
method forward (line 1506) | def forward(
class PixArtAlphaCombinedTimestepSizeEmbeddings (line 1557) | class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
method __init__ (line 1565) | def __init__(self, embedding_dim, size_emb_dim, use_additional_conditi...
method forward (line 1578) | def forward(self, timestep, resolution, aspect_ratio, batch_size, hidd...
class PixArtAlphaTextProjection (line 1594) | class PixArtAlphaTextProjection(nn.Module):
method __init__ (line 1601) | def __init__(self, in_features, hidden_size, out_features=None, act_fn...
method forward (line 1616) | def forward(self, caption):
class IPAdapterPlusImageProjectionBlock (line 1623) | class IPAdapterPlusImageProjectionBlock(nn.Module):
method __init__ (line 1624) | def __init__(
method forward (line 1647) | def forward(self, x, latents, residual):
class IPAdapterPlusImageProjection (line 1656) | class IPAdapterPlusImageProjection(nn.Module):
method __init__ (line 1672) | def __init__(
method forward (line 1695) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class IPAdapterFaceIDPlusImageProjection (line 1715) | class IPAdapterFaceIDPlusImageProjection(nn.Module):
method __init__ (line 1732) | def __init__(
method forward (line 1767) | def forward(self, id_embeds: torch.Tensor) -> torch.Tensor:
class MultiIPAdapterImageProjection (line 1795) | class MultiIPAdapterImageProjection(nn.Module):
method __init__ (line 1796) | def __init__(self, IPAdapterImageProjectionLayers: Union[List[nn.Modul...
method forward (line 1800) | def forward(self, image_embeds: List[torch.Tensor]):
FILE: CogVideo/finetune/models/pipeline_cogvideox.py
function get_resize_crop_region_for_grid (line 68) | def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
function retrieve_timesteps (line 87) | def retrieve_timesteps(
class CogVideoXPipeline (line 146) | class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
method __init__ (line 178) | def __init__(
method _get_t5_prompt_embeds (line 203) | def _get_t5_prompt_embeds(
method encode_prompt (line 248) | def encode_prompt(
method prepare_latents (line 329) | def prepare_latents(
method decode_latents (line 355) | def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
method prepare_extra_step_kwargs (line 363) | def prepare_extra_step_kwargs(self, generator, eta):
method check_inputs (line 381) | def check_inputs(
method fuse_qkv_projections (line 432) | def fuse_qkv_projections(self) -> None:
method unfuse_qkv_projections (line 437) | def unfuse_qkv_projections(self) -> None:
method _prepare_rotary_positional_embeddings (line 445) | def _prepare_rotary_positional_embeddings(
method guidance_scale (line 472) | def guidance_scale(self):
method num_timesteps (line 476) | def num_timesteps(self):
method attention_kwargs (line 480) | def attention_kwargs(self):
method interrupt (line 484) | def interrupt(self):
method __call__ (line 489) | def __call__(
FILE: CogVideo/finetune/models/pipeline_output.py
class CogVideoXPipelineOutput (line 9) | class CogVideoXPipelineOutput(BaseOutput):
FILE: CogVideo/finetune/models/utils.py
function lora_state_dict (line 55) | def lora_state_dict(
FILE: CogVideo/finetune/train_cogvideox_injector.py
function get_args (line 59) | def get_args():
function parse_matrix (line 412) | def parse_matrix(matrix_str):
class VideoDataset (line 420) | class VideoDataset(Dataset):
method __init__ (line 421) | def __init__(
method __len__ (line 537) | def __len__(self):
method save_images2video (line 540) | def save_images2video(self, images, video_name):
method __getitem__ (line 564) | def __getitem__(self, idx):
method load_sceneposes (line 671) | def load_sceneposes(self, objs_file, obj_idx, obj_transl):
function save_model_card (line 683) | def save_model_card(
function log_validation (line 763) | def log_validation(
function _get_t5_prompt_embeds (line 828) | def _get_t5_prompt_embeds(
function encode_prompt (line 868) | def encode_prompt(
function compute_prompt_embeddings (line 892) | def compute_prompt_embeddings(
function prepare_rotary_positional_embeddings (line 919) | def prepare_rotary_positional_embeddings(
function get_optimizer (line 948) | def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
function main (line 1029) | def main(args):
FILE: CogVideo/finetune/train_cogvideox_lora.py
function get_args (line 56) | def get_args():
function parse_matrix (line 393) | def parse_matrix(matrix_str):
class VideoDataset (line 401) | class VideoDataset(Dataset):
method __init__ (line 402) | def __init__(
method __len__ (line 518) | def __len__(self):
method save_images2video (line 521) | def save_images2video(self, images, video_name):
method __getitem__ (line 545) | def __getitem__(self, idx):
method load_sceneposes (line 650) | def load_sceneposes(self, objs_file, obj_idx, obj_transl):
function save_model_card (line 662) | def save_model_card(
function log_validation (line 742) | def log_validation(
function _get_t5_prompt_embeds (line 807) | def _get_t5_prompt_embeds(
function encode_prompt (line 845) | def encode_prompt(
function compute_prompt_embeddings (line 869) | def compute_prompt_embeddings(
function prepare_rotary_positional_embeddings (line 896) | def prepare_rotary_positional_embeddings(
function get_optimizer (line 925) | def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
function main (line 1007) | def main(args):
FILE: CogVideo/inference/3dtrajmaster_inference.py
function parse_matrix (line 32) | def parse_matrix(matrix_str):
function load_sceneposes (line 40) | def load_sceneposes(objs_file, obj_idx, obj_transl):
function get_pose_embeds (line 51) | def get_pose_embeds(scene, video_name, instance_data_root, locations_inf...
function init_cam_poses (line 90) | def init_cam_poses(instance_data_root):
function generate_video (line 104) | def generate_video(
FILE: CogVideo/tools/caption/video_caption.py
function load_video (line 20) | def load_video(video_data, strategy='chat'):
function predict (line 64) | def predict(prompt, video_data, temperature):
function test (line 99) | def test():
FILE: CogVideo/tools/convert_weight_sat2hf.py
function reassign_query_key_value_inplace (line 35) | def reassign_query_key_value_inplace(key: str, state_dict: Dict[str, Any]):
function reassign_query_key_layernorm_inplace (line 46) | def reassign_query_key_layernorm_inplace(key: str, state_dict: Dict[str,...
function reassign_adaln_norm_inplace (line 57) | def reassign_adaln_norm_inplace(key: str, state_dict: Dict[str, Any]):
function remove_keys_inplace (line 73) | def remove_keys_inplace(key: str, state_dict: Dict[str, Any]):
function replace_up_keys_inplace (line 77) | def replace_up_keys_inplace(key: str, state_dict: Dict[str, Any]):
function get_state_dict (line 140) | def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
function update_state_dict_inplace (line 151) | def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, ...
function convert_transformer (line 155) | def convert_transformer(
function convert_vae (line 189) | def convert_vae(ckpt_path: str, scaling_factor: float, dtype: torch.dtype):
function get_args (line 209) | def get_args():
FILE: CogVideo/tools/export_sat_lora_weight.py
function get_state_dict (line 8) | def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
function export_lora_weight (line 38) | def export_lora_weight(ckpt_path,lora_save_directory):
function get_args (line 71) | def get_args():
FILE: CogVideo/tools/llm_flux_cogvideox/gradio_page.py
function generate_caption (line 81) | def generate_caption(prompt):
function generate_image (line 101) | def generate_image(caption, progress=gr.Progress(track_tqdm=True)):
function generate_video (line 112) | def generate_video(
function save_video (line 134) | def save_video(tensor):
function convert_to_gif (line 142) | def convert_to_gif(video_path):
function delete_old_files (line 151) | def delete_old_files():
FILE: CogVideo/tools/llm_flux_cogvideox/llm_flux_cogvideox.py
function get_args (line 62) | def get_args():
function reset_memory (line 142) | def reset_memory():
function main (line 150) | def main(args: Dict[str, Any]) -> None:
FILE: CogVideo/tools/load_cogvideox_lora.py
function get_args (line 32) | def get_args():
FILE: CogVideo/tools/parallel_inference/parallel_inference_xdit.py
function main (line 42) | def main():
FILE: CogVideo/tools/replicate/predict_i2v.py
function download_weights (line 26) | def download_weights(url, dest):
class Predictor (line 34) | class Predictor(BasePredictor):
method setup (line 35) | def setup(self) -> None:
method predict (line 49) | def predict(
FILE: CogVideo/tools/replicate/predict_t2v.py
function download_weights (line 26) | def download_weights(url, dest):
class Predictor (line 34) | class Predictor(BasePredictor):
method setup (line 35) | def setup(self) -> None:
method predict (line 50) | def predict(
FILE: dataset/utils.py
function get_camera_frustum (line 28) | def get_camera_frustum(img_size, K, W2C, frustum_length=0.5, color=[0., ...
function frustums2lineset (line 55) | def frustums2lineset(frustums):
function visualize_cameras (line 73) | def visualize_cameras(colored_camera_dicts, sphere_radius, camera_size=0...
function parse_matrix (line 110) | def parse_matrix(matrix_str):
function load_sceneposes (line 118) | def load_sceneposes(objs_file, obj_idx, obj_transl):
function save_images2video (line 129) | def save_images2video(images, video_name, fps):
function normalize (line 154) | def normalize(x):
function viewmatrix (line 157) | def viewmatrix(z, up, pos):
function matrix_to_euler_angles (line 165) | def matrix_to_euler_angles(matrix):
function eul2rot (line 180) | def eul2rot(theta) :
function extract_location_rotation (line 188) | def extract_location_rotation(data):
function get_cam_points_vis (line 200) | def get_cam_points_vis(W, H, intrinsics, ext_pose, color,frustum_length):
function batch_axis_angle_to_rotation_matrix (line 212) | def batch_axis_angle_to_rotation_matrix(r_batch):
FILE: eval/GVHMR/hmr4d/__init__.py
function os_chdir_to_proj_root (line 7) | def os_chdir_to_proj_root():
FILE: eval/GVHMR/hmr4d/build_gvhmr.py
function build_gvhmr_demo (line 7) | def build_gvhmr_demo():
FILE: eval/GVHMR/hmr4d/configs/__init__.py
function register_store_gvhmr (line 14) | def register_store_gvhmr():
function parse_args_to_cfg (line 19) | def parse_args_to_cfg():
FILE: eval/GVHMR/hmr4d/datamodule/mocap_trainX_testY.py
function collate_fn (line 17) | def collate_fn(batch):
class DataModule (line 33) | class DataModule(pl.LightningDataModule):
method __init__ (line 34) | def __init__(self, dataset_opts: DictConfig, loader_opts: DictConfig, ...
method train_dataloader (line 82) | def train_dataloader(self):
method val_dataloader (line 96) | def val_dataloader(self):
method test_dataloader (line 114) | def test_dataloader(self):
FILE: eval/GVHMR/hmr4d/dataset/bedlam/bedlam.py
class BedlamDatasetV2 (line 22) | class BedlamDatasetV2(ImgfeatMotionDatasetBase):
method __init__ (line 30) | def __init__(
method _load_dataset (line 50) | def _load_dataset(self):
method _get_idx2meta (line 80) | def _get_idx2meta(self):
method _load_data (line 85) | def _load_data(self, idx):
method _process_data (line 128) | def _process_data(self, data, idx):
FILE: eval/GVHMR/hmr4d/dataset/bedlam/utils.py
function mid2vname (line 8) | def mid2vname(mid):
function mid2featname (line 18) | def mid2featname(mid):
function featname2mid (line 28) | def featname2mid(featname):
function load_vname2lwh (line 38) | def load_vname2lwh():
FILE: eval/GVHMR/hmr4d/dataset/emdb/emdb_motion_test.py
class EmdbSmplFullSeqDataset (line 21) | class EmdbSmplFullSeqDataset(data.Dataset):
method __init__ (line 22) | def __init__(self, split=1, flip_test=False):
method __len__ (line 52) | def __len__(self):
method _load_data (line 55) | def _load_data(self, idx):
method _process_data (line 136) | def _process_data(self, data):
method __getitem__ (line 141) | def __getitem__(self, idx):
FILE: eval/GVHMR/hmr4d/dataset/emdb/utils.py
function name_to_subfolder (line 10) | def name_to_subfolder(name):
function name_to_local_pkl_path (line 14) | def name_to_local_pkl_path(name):
function load_raw_pkl (line 18) | def load_raw_pkl(fp):
function load_pkl (line 24) | def load_pkl(fp):
function _check_annot (line 103) | def _check_annot(emdb_raw_dir=Path("inputs/EMDB/EMDB")):
function _check_length (line 110) | def _check_length(emdb_raw_dir=Path("inputs/EMDB/EMDB"), emdb_hmr4d_supp...
FILE: eval/GVHMR/hmr4d/dataset/h36m/h36m.py
class H36mSmplDataset (line 23) | class H36mSmplDataset(ImgfeatMotionDatasetBase):
method __init__ (line 24) | def __init__(
method _load_dataset (line 43) | def _load_dataset(self):
method _get_idx2meta (line 69) | def _get_idx2meta(self):
method _load_data (line 82) | def _load_data(self, idx):
method _process_data (line 119) | def _process_data(self, data, idx):
FILE: eval/GVHMR/hmr4d/dataset/h36m/utils.py
function get_vid (line 13) | def get_vid(pkl_path, cam_id):
function get_raw_pkl_paths (line 20) | def get_raw_pkl_paths(h36m_raw_root):
function get_cam_KRts (line 31) | def get_cam_KRts():
function parse_raw_pkl (line 60) | def parse_raw_pkl(pkl_path, to_50hz=True):
FILE: eval/GVHMR/hmr4d/dataset/imgfeat_motion/base_dataset.py
class ImgfeatMotionDatasetBase (line 8) | class ImgfeatMotionDatasetBase(data.Dataset):
method __init__ (line 9) | def __init__(self):
method __len__ (line 14) | def __len__(self):
method _load_dataset (line 17) | def _load_dataset(self):
method _get_idx2meta (line 20) | def _get_idx2meta(self):
method _load_data (line 23) | def _load_data(self, idx):
method _process_data (line 26) | def _process_data(self, data, idx):
method __getitem__ (line 29) | def __getitem__(self, idx):
FILE: eval/GVHMR/hmr4d/dataset/pure_motion/amass.py
class AmassDataset (line 16) | class AmassDataset(BaseDataset):
method __init__ (line 17) | def __init__(
method _load_dataset (line 34) | def _load_dataset(self):
method _get_idx2meta (line 54) | def _get_idx2meta(self):
method _load_data (line 76) | def _load_data(self, idx):
FILE: eval/GVHMR/hmr4d/dataset/pure_motion/base_dataset.py
class BaseDataset (line 16) | class BaseDataset(Dataset):
method __init__ (line 17) | def __init__(self, cam_augmentation, limit_size=None):
method _load_dataset (line 27) | def _load_dataset(self):
method _get_idx2meta (line 30) | def _get_idx2meta(self):
method __len__ (line 34) | def __len__(self):
method _load_data (line 39) | def _load_data(self, idx):
method _process_data (line 42) | def _process_data(self, data, idx):
method __getitem__ (line 179) | def __getitem__(self, idx):
FILE: eval/GVHMR/hmr4d/dataset/pure_motion/cam_traj_utils.py
function noisy_interpolation (line 21) | def noisy_interpolation(x, length, step_noise_perc=0.2):
function noisy_impluse_interpolation (line 44) | def noisy_impluse_interpolation(data1, data2, step_noise_perc=0.2):
function create_camera (line 62) | def create_camera(w_root, cfg):
function create_rotation_move (line 113) | def create_rotation_move(R, length, r_xyz_w_std=[np.pi / 8, np.pi / 4, n...
function create_translation_move (line 134) | def create_translation_move(R_w2c, t_w2c, length, t_xyz_w_std=[1.0, 0.25...
class CameraAugmentorV11 (line 151) | class CameraAugmentorV11:
method __init__ (line 191) | def __init__(self):
method create_rotation_track (line 196) | def create_rotation_track(self, cam_mat, root, rx_factor=1.0, ry_facto...
method create_translation_track (line 220) | def create_translation_track(self, cam_mat, root, t_factor=1.0, tz_bia...
method add_stepnoise (line 242) | def add_stepnoise(self, R, T):
method __call__ (line 333) | def __call__(self, w_j3d, length=120):
FILE: eval/GVHMR/hmr4d/dataset/pure_motion/utils.py
function aa_to_r6d (line 12) | def aa_to_r6d(x):
function r6d_to_aa (line 16) | def r6d_to_aa(x):
function interpolate_smpl_params (line 20) | def interpolate_smpl_params(smpl_params, tgt_len):
function rotate_around_axis (line 51) | def rotate_around_axis(global_orient, transl, axis="y"):
function augment_betas (line 63) | def augment_betas(betas, std=0.1):
FILE: eval/GVHMR/hmr4d/dataset/rich/rich_motion_test.py
class RichSmplFullSeqDataset (line 35) | class RichSmplFullSeqDataset(data.Dataset):
method __init__ (line 36) | def __init__(self, vid_presets=None):
method __len__ (line 68) | def __len__(self):
method _load_data (line 71) | def _load_data(self, idx):
method _process_data (line 120) | def _process_data(self, data):
method __getitem__ (line 167) | def __getitem__(self, idx):
function select_subset (line 173) | def select_subset(labels, vid_presets):
FILE: eval/GVHMR/hmr4d/dataset/rich/rich_utils.py
function sample_idx2meta (line 12) | def sample_idx2meta(idx2meta, sample_interval):
function remove_bbx_invisible_frame (line 27) | def remove_bbx_invisible_frame(idx2meta, img2gtbbx):
function remove_extra_rules (line 44) | def remove_extra_rules(idx2meta):
function compute_bbx (line 53) | def compute_bbx(dataset, data):
function get_2d (line 82) | def get_2d(dataset, data):
function squared_crop_and_resize (line 95) | def squared_crop_and_resize(dataset, img, bbx_lurb, dst_size=224, state=...
function get_augmented_square_bbx (line 124) | def get_augmented_square_bbx(bbx_lurb, per_shift=0.1, per_zoomout=0.2, b...
function get_squared_bbx_region_and_resize (line 152) | def get_squared_bbx_region_and_resize(frames, bbx_xys, dst_size=224):
function extract_cam_xml (line 182) | def extract_cam_xml(xml_path="", dtype=torch.float32):
function get_cam2params (line 198) | def get_cam2params(scene_info_root=None):
function get_w2az_sahmr (line 223) | def get_w2az_sahmr():
function has_multi_persons (line 235) | def has_multi_persons(seq_name):
function parse_seqname_info (line 243) | def parse_seqname_info(skip_multi_persons=True):
function get_seqnames_of_split (line 269) | def get_seqnames_of_split(splits=["train"], skip_multi_persons=True):
function get_seqname_to_imgrange (line 284) | def get_seqname_to_imgrange():
function get_img_key (line 306) | def get_img_key(seq_name, cam_id, f_id):
function get_seq_cam_fn (line 312) | def get_seq_cam_fn(img_root, seq_name, cam_id):
function get_img_fn (line 322) | def get_img_fn(img_root, seq_name, cam_id, f_id):
function get_cam_key_wham_vid (line 336) | def get_cam_key_wham_vid(vid):
function get_K_wham_vid (line 344) | def get_K_wham_vid(vid):
class RichVid2Tc2az (line 351) | class RichVid2Tc2az:
method __init__ (line 352) | def __init__(self) -> None:
method __call__ (line 358) | def __call__(self, vid):
method get_T_w2az (line 366) | def get_T_w2az(self, vid):
FILE: eval/GVHMR/hmr4d/dataset/threedpw/threedpw_motion_test.py
class ThreedpwSmplFullSeqDataset (line 17) | class ThreedpwSmplFullSeqDataset(data.Dataset):
method __init__ (line 18) | def __init__(self, flip_test=False, skip_invalid=False):
method __len__ (line 42) | def __len__(self):
method _load_data (line 45) | def _load_data(self, idx):
method _process_data (line 118) | def _process_data(self, data):
method __getitem__ (line 137) | def __getitem__(self, idx):
FILE: eval/GVHMR/hmr4d/dataset/threedpw/threedpw_motion_train.py
class ThreedpwSmplDataset (line 20) | class ThreedpwSmplDataset(ImgfeatMotionDatasetBase):
method __init__ (line 21) | def __init__(self):
method _load_dataset (line 31) | def _load_dataset(self):
method _get_idx2meta (line 47) | def _get_idx2meta(self):
method _load_data (line 64) | def _load_data(self, idx):
method _process_data (line 98) | def _process_data(self, data, idx):
FILE: eval/GVHMR/hmr4d/dataset/threedpw/utils.py
function read_raw_pkl (line 12) | def read_raw_pkl(pkl_path):
function load_and_convert_wham_pth (line 48) | def load_and_convert_wham_pth(pth):
function na_cam_param_to_K_fullimg (line 77) | def na_cam_param_to_K_fullimg(cam_param):
FILE: eval/GVHMR/hmr4d/model/common_utils/scheduler.py
class WarmupMultiStepLR (line 5) | class WarmupMultiStepLR(torch.optim.lr_scheduler.LRScheduler):
method __init__ (line 6) | def __init__(self, optimizer, milestones, warmup=0, gamma=0.1, last_ep...
method get_lr (line 14) | def get_lr(self):
FILE: eval/GVHMR/hmr4d/model/common_utils/scheduler_cfg.py
function epoch_half_by (line 10) | def epoch_half_by(milestones=[100, 200, 300]):
function warmup_epoch_half_by (line 31) | def warmup_epoch_half_by(warmup=10, milestones=[100, 200, 300]):
FILE: eval/GVHMR/hmr4d/model/gvhmr/callbacks/metric_3dpw.py
class MetricMocap (line 19) | class MetricMocap(pl.Callback):
method __init__ (line 20) | def __init__(self):
method on_predict_batch_end (line 46) | def on_predict_batch_end(self, trainer, pl_module, outputs, batch, bat...
method on_predict_epoch_end (line 136) | def on_predict_epoch_end(self, trainer, pl_module):
FILE: eval/GVHMR/hmr4d/model/gvhmr/callbacks/metric_emdb.py
class MetricMocap (line 32) | class MetricMocap(pl.Callback):
method __init__ (line 33) | def __init__(self, emdb_split=1):
method on_predict_batch_end (line 76) | def on_predict_batch_end(self, trainer, pl_module, outputs, batch, bat...
method on_predict_epoch_end (line 267) | def on_predict_epoch_end(self, trainer, pl_module):
FILE: eval/GVHMR/hmr4d/model/gvhmr/callbacks/metric_rich.py
class MetricMocap (line 36) | class MetricMocap(pl.Callback):
method __init__ (line 37) | def __init__(self):
method on_predict_batch_end (line 81) | def on_predict_batch_end(self, trainer, pl_module, outputs, batch, bat...
method on_predict_epoch_end (line 339) | def on_predict_epoch_end(self, trainer, pl_module):
FILE: eval/GVHMR/hmr4d/model/gvhmr/gvhmr_pl.py
class GvhmrPL (line 29) | class GvhmrPL(pl.LightningModule):
method __init__ (line 30) | def __init__(
method training_step (line 51) | def training_step(self, batch, batch_idx):
method validation_step (line 158) | def validation_step(self, batch, batch_idx, dataloader_idx=0):
method configure_optimizers (line 277) | def configure_optimizers(self):
method on_save_checkpoint (line 292) | def on_save_checkpoint(self, checkpoint) -> None:
method load_pretrained_model (line 299) | def load_pretrained_model(self, ckpt_path):
FILE: eval/GVHMR/hmr4d/model/gvhmr/gvhmr_pl_demo.py
class DemoPL (line 10) | class DemoPL(pl.LightningModule):
method __init__ (line 11) | def __init__(self, pipeline):
method predict (line 16) | def predict(self, data, static_cam=False):
method load_pretrained_model (line 48) | def load_pretrained_model(self, ckpt_path):
FILE: eval/GVHMR/hmr4d/model/gvhmr/pipeline/gvhmr_pipeline.py
class Pipeline (line 35) | class Pipeline(nn.Module):
method __init__ (line 36) | def __init__(self, args, args_denoiser3d, **kwargs):
method forward (line 54) | def forward(self, inputs, train=False, postproc=False, static_cam=False):
function randomly_set_null_condition (line 138) | def randomly_set_null_condition(f_condition, uncond_prob=0.1):
function compute_extra_incam_loss (line 150) | def compute_extra_incam_loss(inputs, outputs, ppl):
function compute_extra_global_loss (line 279) | def compute_extra_global_loss(inputs, outputs, ppl):
function get_smpl_params_w_Rt_v2 (line 322) | def get_smpl_params_w_Rt_v2(
FILE: eval/GVHMR/hmr4d/model/gvhmr/utils/endecoder.py
class EnDecoder (line 20) | class EnDecoder(nn.Module):
method __init__ (line 21) | def __init__(self, stats_name="DEFAULT_01", noise_pose_k=10):
method get_noisyobs (line 38) | def get_noisyobs(self, data, return_type="r6d"):
method normalize_body_pose_r6d (line 57) | def normalize_body_pose_r6d(self, body_pose_r6d):
method fk_v2 (line 66) | def fk_v2(self, body_pose, betas, global_orient=None, transl=None, get...
method get_local_pos (line 96) | def get_local_pos(self, betas):
method encode (line 102) | def encode(self, inputs):
method encode_translw (line 139) | def encode_translw(self, inputs):
method decode_translw (line 158) | def decode_translw(self, x_norm):
method decode (line 161) | def decode(self, x_norm):
FILE: eval/GVHMR/hmr4d/model/gvhmr/utils/postprocess.py
function pp_static_joint (line 20) | def pp_static_joint(outputs, endecoder: EnDecoder):
function pp_static_joint_cam (line 62) | def pp_static_joint_cam(outputs, endecoder: EnDecoder):
function process_ik (line 122) | def process_ik(outputs, endecoder):
FILE: eval/GVHMR/hmr4d/model/gvhmr/utils/stats_compose.py
function compose (line 190) | def compose(targets, sources):
FILE: eval/GVHMR/hmr4d/network/base_arch/embeddings/rotary_embedding.py
function rotate_half (line 7) | def rotate_half(x):
function apply_rotary_emb (line 15) | def apply_rotary_emb(freqs, t, start_index=0, scale=1.0, seq_dim=-2):
function get_encoding (line 32) | def get_encoding(d_model, max_seq_len=4096):
class ROPE (line 41) | class ROPE(nn.Module):
method __init__ (line 44) | def __init__(self, d_model, max_seq_len=4096):
method rotate_queries_or_keys (line 53) | def rotate_queries_or_keys(self, x):
FILE: eval/GVHMR/hmr4d/network/base_arch/transformer/encoder_rope.py
class RoPEAttention (line 11) | class RoPEAttention(nn.Module):
method __init__ (line 12) | def __init__(self, embed_dim, num_heads, dropout=0.1):
method forward (line 26) | def forward(self, x, attn_mask=None, key_padding_mask=None):
class EncoderRoPEBlock (line 56) | class EncoderRoPEBlock(nn.Module):
method __init__ (line 57) | def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, dropout=0.1,...
method forward (line 73) | def forward(self, x, attn_mask=None, tgt_key_padding_mask=None):
method _sa_block (line 80) | def _sa_block(self, x, attn_mask=None, key_padding_mask=None):
FILE: eval/GVHMR/hmr4d/network/base_arch/transformer/layer.py
function zero_module (line 6) | def zero_module(module):
FILE: eval/GVHMR/hmr4d/network/gvhmr/relative_transformer.py
class NetworkEncoderRoPE (line 14) | class NetworkEncoderRoPE(nn.Module):
method __init__ (line 15) | def __init__(
method _build_condition_embedder (line 87) | def _build_condition_embedder(self):
method forward (line 109) | def forward(self, length, obs=None, f_cliffcam=None, f_cam_angvel=None...
FILE: eval/GVHMR/hmr4d/network/hmr2/__init__.py
function load_hmr2 (line 10) | def load_hmr2(checkpoint_path=HMR2A_CKPT):
FILE: eval/GVHMR/hmr4d/network/hmr2/components/pose_transformer.py
function exists (line 17) | def exists(val):
function default (line 21) | def default(val, d):
class PreNorm (line 27) | class PreNorm(nn.Module):
method __init__ (line 28) | def __init__(self, dim: int, fn: Callable, norm: str = "layer", norm_c...
method forward (line 33) | def forward(self, x: torch.Tensor, *args, **kwargs):
class FeedForward (line 40) | class FeedForward(nn.Module):
method __init__ (line 41) | def __init__(self, dim, hidden_dim, dropout=0.0):
method forward (line 51) | def forward(self, x):
class Attention (line 55) | class Attention(nn.Module):
method __init__ (line 56) | def __init__(self, dim, heads=8, dim_head=64, dropout=0.0):
method forward (line 75) | def forward(self, x):
class CrossAttention (line 89) | class CrossAttention(nn.Module):
method __init__ (line 90) | def __init__(self, dim, context_dim=None, heads=8, dim_head=64, dropou...
method forward (line 111) | def forward(self, x, context=None):
class Transformer (line 127) | class Transformer(nn.Module):
method __init__ (line 128) | def __init__(
method forward (line 153) | def forward(self, x: torch.Tensor, *args):
class TransformerCrossAttn (line 160) | class TransformerCrossAttn(nn.Module):
method __init__ (line 161) | def __init__(
method forward (line 191) | def forward(self, x: torch.Tensor, *args, context=None, context_list=N...
class DropTokenDropout (line 204) | class DropTokenDropout(nn.Module):
method __init__ (line 205) | def __init__(self, p: float = 0.1):
method forward (line 213) | def forward(self, x: torch.Tensor):
class ZeroTokenDropout (line 223) | class ZeroTokenDropout(nn.Module):
method __init__ (line 224) | def __init__(self, p: float = 0.1):
method forward (line 232) | def forward(self, x: torch.Tensor):
class TransformerEncoder (line 241) | class TransformerEncoder(nn.Module):
method __init__ (line 242) | def __init__(
method forward (line 283) | def forward(self, inp: torch.Tensor, *args, **kwargs):
class TransformerDecoder (line 301) | class TransformerDecoder(nn.Module):
method __init__ (line 302) | def __init__(
method forward (line 349) | def forward(self, inp: torch.Tensor, *args, context=None, context_list...
FILE: eval/GVHMR/hmr4d/network/hmr2/components/t_cond_mlp.py
class AdaptiveLayerNorm1D (line 7) | class AdaptiveLayerNorm1D(torch.nn.Module):
method __init__ (line 8) | def __init__(self, data_dim: int, norm_cond_dim: int):
method forward (line 21) | def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
class SequentialCond (line 36) | class SequentialCond(torch.nn.Sequential):
method forward (line 37) | def forward(self, input, *args, **kwargs):
function normalization_layer (line 48) | def normalization_layer(norm: Optional[str], dim: int, norm_cond_dim: in...
function linear_norm_activ_dropout (line 62) | def linear_norm_activ_dropout(
function create_simple_mlp (line 81) | def create_simple_mlp(
class ResidualMLPBlock (line 104) | class ResidualMLPBlock(torch.nn.Module):
method __init__ (line 105) | def __init__(
method forward (line 135) | def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
class ResidualMLP (line 139) | class ResidualMLP(torch.nn.Module):
method __init__ (line 140) | def __init__(
method forward (line 176) | def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
class FrequencyEmbedder (line 180) | class FrequencyEmbedder(torch.nn.Module):
method __init__ (line 181) | def __init__(self, num_frequencies, max_freq_log2):
method forward (line 186) | def forward(self, x):
FILE: eval/GVHMR/hmr4d/network/hmr2/configs/__init__.py
function to_lower (line 10) | def to_lower(x: Dict) -> Dict:
function default_config (line 71) | def default_config() -> CN:
function dataset_config (line 80) | def dataset_config(name="datasets_tar.yaml") -> CN:
function dataset_eval_config (line 93) | def dataset_eval_config() -> CN:
function get_config (line 97) | def get_config(config_file: str, merge: bool = True) -> CN:
FILE: eval/GVHMR/hmr4d/network/hmr2/hmr2.py
class HMR2 (line 11) | class HMR2(pl.LightningModule):
method __init__ (line 12) | def __init__(self, cfg: CfgNode):
method forward (line 29) | def forward(self, batch, feat_mode=True):
FILE: eval/GVHMR/hmr4d/network/hmr2/smpl_head.py
class SMPLTransformerDecoderHead (line 11) | class SMPLTransformerDecoderHead(nn.Module):
method __init__ (line 14) | def __init__(self, cfg):
method forward (line 48) | def forward(self, x, only_return_token_out=False):
FILE: eval/GVHMR/hmr4d/network/hmr2/utils/geometry.py
function aa_to_rotmat (line 6) | def aa_to_rotmat(theta: torch.Tensor):
function quat_to_rotmat (line 25) | def quat_to_rotmat(quat: torch.Tensor) -> torch.Tensor:
function rot6d_to_rotmat (line 60) | def rot6d_to_rotmat(x: torch.Tensor) -> torch.Tensor:
function perspective_projection (line 78) | def perspective_projection(
FILE: eval/GVHMR/hmr4d/network/hmr2/utils/preproc.py
function expand_to_aspect_ratio (line 10) | def expand_to_aspect_ratio(input_shape, target_aspect_ratio=[192, 256]):
function crop_and_resize (line 32) | def crop_and_resize(img, bbx_xy, bbx_s, dst_size=256, enlarge_ratio=1.2):
FILE: eval/GVHMR/hmr4d/network/hmr2/utils/smpl_wrapper.py
class SMPL (line 10) | class SMPL(smplx.SMPLLayer):
method __init__ (line 11) | def __init__(self, *args, joint_regressor_extra: Optional[str] = None,...
method forward (line 29) | def forward(self, *args, **kwargs) -> SMPLOutput:
FILE: eval/GVHMR/hmr4d/network/hmr2/vit.py
function vit (line 12) | def vit(cfg):
function get_abs_pos (line 26) | def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
class DropPath (line 59) | class DropPath(nn.Module):
method __init__ (line 62) | def __init__(self, drop_prob=None):
method forward (line 66) | def forward(self, x):
method extra_repr (line 69) | def extra_repr(self):
class Mlp (line 72) | class Mlp(nn.Module):
method __init__ (line 73) | def __init__(self, in_features, hidden_features=None, out_features=Non...
method forward (line 82) | def forward(self, x):
class Attention (line 89) | class Attention(nn.Module):
method __init__ (line 90) | def __init__(
method forward (line 110) | def forward(self, x):
class Block (line 128) | class Block(nn.Module):
method __init__ (line 130) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
method forward (line 148) | def forward(self, x):
class PatchEmbed (line 154) | class PatchEmbed(nn.Module):
method __init__ (line 157) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
method forward (line 170) | def forward(self, x, **kwargs):
class HybridEmbed (line 179) | class HybridEmbed(nn.Module):
method __init__ (line 183) | def __init__(self, backbone, img_size=224, feature_size=None, in_chans...
method forward (line 204) | def forward(self, x):
class ViT (line 211) | class ViT(nn.Module):
method __init__ (line 213) | def __init__(self,
method _freeze_stages (line 259) | def _freeze_stages(self):
method init_weights (line 296) | def init_weights(self):
method get_num_layers (line 313) | def get_num_layers(self):
method no_weight_decay (line 317) | def no_weight_decay(self):
method forward_features (line 320) | def forward_features(self, x):
method forward (line 341) | def forward(self, x):
method train (line 345) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/body_model/body_model.py
class BodyModel (line 12) | class BodyModel(nn.Module):
method __init__ (line 18) | def __init__(self,
method forward (line 77) | def forward(self, root_orient=None, pose_body=None, pose_hand=None, po...
method forward_motion (line 119) | def forward_motion(self, **kwargs):
FILE: eval/GVHMR/hmr4d/utils/body_model/body_model_smplh.py
class BodyModelSMPLH (line 15) | class BodyModelSMPLH(nn.Module):
method __init__ (line 18) | def __init__(self, model_path, **kwargs):
method forward (line 37) | def forward(
method get_skeleton (line 95) | def get_skeleton(self, betas):
FILE: eval/GVHMR/hmr4d/utils/body_model/body_model_smplx.py
class BodyModelSMPLX (line 19) | class BodyModelSMPLX(nn.Module):
method __init__ (line 22) | def __init__(self, model_path, **kwargs):
method forward (line 39) | def forward(
method get_skeleton (line 119) | def get_skeleton(self, betas):
method forward_bfc (line 124) | def forward_bfc(self, **kwargs):
FILE: eval/GVHMR/hmr4d/utils/body_model/min_lbs.py
class MinimalLBS (line 10) | class MinimalLBS(nn.Module):
method __init__ (line 11) | def __init__(self, sp_ids, bm_dir='models/smplh', num_betas=16, model_...
method load_struct_on_sp (line 20) | def load_struct_on_sp(self, bm_path, prefix='m'):
method forward (line 51) | def forward(self, root_orient=None, pose_body=None, trans=None, betas=...
FILE: eval/GVHMR/hmr4d/utils/body_model/smpl_lite.py
class SmplLite (line 16) | class SmplLite(nn.Module):
method __init__ (line 17) | def __init__(
method register_smpl_buffers (line 39) | def register_smpl_buffers(self, data_struct, num_betas):
method register_fast_skeleton_computing_buffers (line 66) | def register_fast_skeleton_computing_buffers(self):
method get_skeleton (line 73) | def get_skeleton(self, betas):
method forward (line 76) | def forward(
class SmplxLiteJ24 (line 119) | class SmplxLiteJ24(SmplLite):
method __init__ (line 120) | def __init__(self, **kwargs):
method forward (line 138) | def forward(self, body_pose, betas, global_orient, transl):
FILE: eval/GVHMR/hmr4d/utils/body_model/smplx_lite.py
class SmplxLite (line 14) | class SmplxLite(nn.Module):
method __init__ (line 15) | def __init__(
method register_smpl_buffers (line 50) | def register_smpl_buffers(self, data_struct, num_betas):
method register_smplh_buffers (line 77) | def register_smplh_buffers(self, data_struct, num_pca_comps, flat_hand...
method register_smplx_buffers (line 93) | def register_smplx_buffers(self, data_struct):
method register_fast_skeleton_computing_buffers (line 98) | def register_fast_skeleton_computing_buffers(self):
method get_skeleton (line 105) | def get_skeleton(self, betas):
method forward (line 108) | def forward(
class SmplxLiteCoco17 (line 165) | class SmplxLiteCoco17(SmplxLite):
method __init__ (line 168) | def __init__(self, **kwargs):
method forward (line 188) | def forward(self, body_pose, betas, global_orient, transl):
class SmplxLiteV437Coco17 (line 196) | class SmplxLiteV437Coco17(SmplxLite):
method __init__ (line 197) | def __init__(self, **kwargs):
method forward (line 222) | def forward(self, body_pose, betas, global_orient, transl):
class SmplxLiteSmplN24 (line 236) | class SmplxLiteSmplN24(SmplxLite):
method __init__ (line 239) | def __init__(self, **kwargs):
method forward (line 259) | def forward(self, body_pose, betas, global_orient, transl):
function batch_rigid_transform_v2 (line 267) | def batch_rigid_transform_v2(rot_mats, joints, parents):
function sync_time (line 300) | def sync_time():
FILE: eval/GVHMR/hmr4d/utils/body_model/utils.py
function smpl_to_openpose (line 120) | def smpl_to_openpose(model_type='smplx', use_hands=True, use_face=True,
FILE: eval/GVHMR/hmr4d/utils/callbacks/prog_bar.py
function format_num (line 21) | def format_num(n):
function convert_kwargs_to_str (line 27) | def convert_kwargs_to_str(**kwargs):
function convert_t_to_str (line 47) | def convert_t_to_str(t):
class MyTQDMProgressBar (line 58) | class MyTQDMProgressBar(TQDMProgressBar, pl.Callback):
method init_train_tqdm (line 59) | def init_train_tqdm(self):
method on_train_batch_end (line 72) | def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch...
class ProgressReporter (line 95) | class ProgressReporter(ProgressBar, pl.Callback):
method __init__ (line 96) | def __init__(
method disable (line 115) | def disable(self):
method setup (line 118) | def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, st...
method print (line 136) | def print(self, *args: Any, **kwargs: Any) -> None:
method get_metrics (line 139) | def get_metrics(self, trainer: pl.Trainer, pl_module: pl.LightningModu...
method _should_update (line 145) | def _should_update(self, n_finished: int, total: int) -> bool:
method on_train_epoch_start (line 157) | def on_train_epoch_start(self, trainer: "pl.Trainer", *_: Any) -> None:
method on_train_batch_end (line 165) | def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch...
method on_train_epoch_end (line 218) | def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.Lightn...
method on_validation_epoch_start (line 245) | def on_validation_epoch_start(self, trainer, pl_module):
method on_validation_batch_end (line 249) | def on_validation_batch_end(self, trainer, pl_module, outputs, batch, ...
method on_validation_epoch_end (line 272) | def on_validation_epoch_end(self, trainer: pl.Trainer, pl_module: pl.L...
class EmojiProgressReporter (line 277) | class EmojiProgressReporter(ProgressBar, pl.Callback):
method __init__ (line 278) | def __init__(
method disable (line 297) | def disable(self):
method setup (line 300) | def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, st...
method print (line 312) | def print(self, *args: Any, **kwargs: Any):
method get_metrics (line 315) | def get_metrics(self, trainer: pl.Trainer, pl_module: pl.LightningModu...
method _should_log_batch (line 321) | def _should_log_batch(self, n: int) -> bool:
method _should_log_epoch (line 330) | def _should_log_epoch(self, n: int) -> bool:
method timestamp_delta_to_str (line 336) | def timestamp_delta_to_str(self, timestamp_delta: float):
method on_train_batch_start (line 357) | def on_train_batch_start(self, trainer: pl.Trainer, pl_module: pl.Ligh...
method on_train_batch_end (line 364) | def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch...
method on_train_epoch_start (line 391) | def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.Ligh...
method on_train_epoch_end (line 399) | def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.Lightn...
FILE: eval/GVHMR/hmr4d/utils/callbacks/simple_ckpt_saver.py
class SimpleCkptSaver (line 11) | class SimpleCkptSaver(Checkpoint):
method __init__ (line 17) | def __init__(
method on_train_epoch_end (line 40) | def on_train_epoch_end(self, trainer, pl_module):
FILE: eval/GVHMR/hmr4d/utils/callbacks/train_speed_timer.py
class TrainSpeedTimer (line 9) | class TrainSpeedTimer(pl.Callback):
method __init__ (line 10) | def __init__(self, N_avg=5):
method on_train_batch_start (line 25) | def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
method on_train_batch_end (line 43) | def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch...
method on_train_epoch_end (line 60) | def on_train_epoch_end(self, trainer, pl_module):
FILE: eval/GVHMR/hmr4d/utils/comm/gather.py
function get_world_size (line 22) | def get_world_size() -> int:
function get_rank (line 30) | def get_rank() -> int:
function get_local_rank (line 38) | def get_local_rank() -> int:
function get_local_size (line 51) | def get_local_size() -> int:
function is_main_process (line 64) | def is_main_process() -> bool:
function synchronize (line 68) | def synchronize():
function _get_global_gloo_group (line 84) | def _get_global_gloo_group():
function _serialize_to_tensor (line 95) | def _serialize_to_tensor(data, group):
function _pad_to_largest_tensor (line 113) | def _pad_to_largest_tensor(tensor, group):
function all_gather (line 137) | def all_gather(data, group=None):
function gather (line 173) | def gather(data, dst=0, group=None):
function shared_random_seed (line 214) | def shared_random_seed():
function reduce_dict (line 228) | def reduce_dict(input_dict, average=True):
FILE: eval/GVHMR/hmr4d/utils/eval/eval_utils.py
function compute_camcoord_metrics (line 6) | def compute_camcoord_metrics(batch, pelvis_idxs=[1, 2], fps=30, mask=None):
function compute_global_metrics (line 60) | def compute_global_metrics(batch, mask=None):
function compute_camcoord_perjoint_metrics (line 138) | def compute_camcoord_perjoint_metrics(batch, pelvis_idxs=[1, 2]):
function compute_jpe (line 176) | def compute_jpe(S1, S2):
function compute_perjoint_jpe (line 180) | def compute_perjoint_jpe(S1, S2):
function batch_align_by_pelvis (line 184) | def batch_align_by_pelvis(data_list, pelvis_idxs=[1, 2]):
function batch_compute_similarity_transform_torch (line 206) | def batch_compute_similarity_transform_torch(S1, S2):
function compute_error_accel (line 260) | def compute_error_accel(joints_gt, joints_pred, valid_mask=None, fps=None):
function compute_rte (line 294) | def compute_rte(target_trans, pred_trans):
function compute_jitter (line 313) | def compute_jitter(joints, fps=30):
function compute_foot_sliding (line 329) | def compute_foot_sliding(target_verts, pred_verts, thr=1e-2):
function convert_joints22_to_24 (line 357) | def convert_joints22_to_24(joints22, ratio2220=0.3438, ratio2321=0.3345):
function align_pcl (line 365) | def align_pcl(Y, X, weight=None, fixed_scale=False):
function global_align_joints (line 413) | def global_align_joints(gt_joints, pred_joints):
function first_align_joints (line 423) | def first_align_joints(gt_joints, pred_joints):
function rearrange_by_mask (line 435) | def rearrange_by_mask(x, mask):
function as_np_array (line 451) | def as_np_array(d):
FILE: eval/GVHMR/hmr4d/utils/geo/augment_noisy_pose.py
function gaussian_augment (line 11) | def gaussian_augment(body_pose, std_angle=10.0, to_R=True):
function get_jitter (line 53) | def get_jitter(shape=(8, 120), s_jittering=5e-2):
function get_jitter_cuda (line 65) | def get_jitter_cuda(shape=(8, 120), s_jittering=5e-2):
function get_lfhp (line 73) | def get_lfhp(shape=(8, 120), s_peak=3e-1, s_peak_mask=5e-3):
function get_lfhp_cuda (line 87) | def get_lfhp_cuda(shape=(8, 120), s_peak=3e-1, s_peak_mask=5e-3):
function get_bias (line 101) | def get_bias(shape=(8, 120), s_bias=1e-1):
function get_bias_cuda (line 109) | def get_bias_cuda(shape=(8, 120), s_bias=1e-1):
function get_wham_aug_kp3d (line 119) | def get_wham_aug_kp3d(shape=(8, 120)):
function get_visible_mask (line 125) | def get_visible_mask(shape=(8, 120), s_mask=0.03):
function get_invisible_legs_mask (line 144) | def get_invisible_legs_mask(shape, s_mask=0.03):
function randomly_occlude_lower_half (line 159) | def randomly_occlude_lower_half(i_x2d, s_mask=0.03):
function randomly_modify_hands_legs (line 182) | def randomly_modify_hands_legs(j3d):
FILE: eval/GVHMR/hmr4d/utils/geo/flip_utils.py
function flip_heatmap_coco17 (line 5) | def flip_heatmap_coco17(output_flipped):
function flip_bbx_xys (line 22) | def flip_bbx_xys(bbx_xys, w):
function flip_kp2d_coco17 (line 31) | def flip_kp2d_coco17(kp2d, w):
function flip_smplx_params (line 40) | def flip_smplx_params(smplx_params):
function avg_smplx_aa (line 69) | def avg_smplx_aa(aa1, aa2):
FILE: eval/GVHMR/hmr4d/utils/geo/hmr_cam.py
function estimate_focal_length (line 6) | def estimate_focal_length(img_w, img_h):
function estimate_K (line 10) | def estimate_K(img_w, img_h):
function convert_K_to_K4 (line 20) | def convert_K_to_K4(K):
function convert_f_to_K (line 25) | def convert_f_to_K(focal_length, img_w, img_h):
function resize_K (line 34) | def resize_K(K, f=0.5):
function create_camera_sensor (line 40) | def create_camera_sensor(width=None, height=None, f_fullframe=None):
function convert_xys_to_cliff_cam_wham (line 72) | def convert_xys_to_cliff_cam_wham(xys, res):
function compute_bbox_info_bedlam (line 103) | def compute_bbox_info_bedlam(bbx_xys, K_fullimg):
function compute_transl_full_cam (line 124) | def compute_transl_full_cam(pred_cam, bbx_xys, K_fullimg):
function get_a_pred_cam (line 139) | def get_a_pred_cam(transl, bbx_xys, K_fullimg):
function project_to_bi01 (line 156) | def project_to_bi01(points, bbx_xys, K_fullimg):
function perspective_projection (line 169) | def perspective_projection(points, K):
function normalize_kp2d (line 180) | def normalize_kp2d(obs_kp2d, bbx_xys, clamp_scale_min=False):
function get_bbx_xys (line 210) | def get_bbx_xys(i_j2d, bbx_ratio=[192, 256], do_augment=False, base_enla...
function safely_render_x3d_K (line 254) | def safely_render_x3d_K(x3d, K_fullimg, thr):
function get_bbx_xys_from_xyxy (line 285) | def get_bbx_xys_from_xyxy(bbx_xyxy, base_enlarge=1.2):
function bbx_xyxy_from_x (line 298) | def bbx_xyxy_from_x(p2d):
function bbx_xyxy_from_masked_x (line 316) | def bbx_xyxy_from_masked_x(p2d, mask):
function bbx_xyxy_ratio (line 349) | def bbx_xyxy_ratio(xyxy1, xyxy2):
function get_mesh_in_fov_category (line 365) | def get_mesh_in_fov_category(mask):
function get_infov_mask (line 385) | def get_infov_mask(p2d, w_real, h_real):
FILE: eval/GVHMR/hmr4d/utils/geo/hmr_global.py
function get_R_c2gv (line 7) | def get_R_c2gv(R_w2c, axis_gravity_in_w=[0, 0, -1]):
function get_tgtcoord_rootparam (line 40) | def get_tgtcoord_rootparam(global_orient, transl, gravity_vec=None, tgt_...
function get_c_rootparam (line 79) | def get_c_rootparam(global_orient, transl, T_w2c, offset):
function get_T_w2c_from_wcparams (line 105) | def get_T_w2c_from_wcparams(global_orient_w, transl_w, global_orient_c, ...
function get_local_transl_vel (line 132) | def get_local_transl_vel(transl, global_orient):
function rollout_local_transl_vel (line 151) | def rollout_local_transl_vel(local_transl_vel, global_orient, transl_0=N...
function get_local_transl_vel_alignhead (line 174) | def get_local_transl_vel_alignhead(transl, global_orient):
function rollout_local_transl_vel_alignhead (line 189) | def rollout_local_transl_vel_alignhead(local_transl_vel_alignhead, globa...
function get_local_transl_vel_alignhead_absy (line 204) | def get_local_transl_vel_alignhead_absy(transl, global_orient):
function rollout_local_transl_vel_alignhead_absy (line 224) | def rollout_local_transl_vel_alignhead_absy(local_transl_vel_alignhead_a...
function get_local_transl_vel_alignhead_absgy (line 254) | def get_local_transl_vel_alignhead_absgy(transl, global_orient):
function rollout_local_transl_vel_alignhead_absgy (line 274) | def rollout_local_transl_vel_alignhead_absgy(local_transl_vel_alignhead_...
function rollout_vel (line 312) | def rollout_vel(vel, transl_0=None):
function get_static_joint_mask (line 331) | def get_static_joint_mask(w_j3d, vel_thr=0.25, smooth=False, repeat_last...
FILE: eval/GVHMR/hmr4d/utils/geo/quaternion.py
function qinv (line 20) | def qinv(q):
function qinv_np (line 27) | def qinv_np(q):
function qnormalize (line 32) | def qnormalize(q):
function qmul (line 37) | def qmul(q, r):
function qrot (line 58) | def qrot(q, v):
function qeuler (line 80) | def qeuler(q, order, epsilon=0, deg=True):
function qmul_np (line 133) | def qmul_np(q, r):
function qrot_np (line 139) | def qrot_np(q, v):
function qeuler_np (line 145) | def qeuler_np(q, order, epsilon=0, use_gpu=False):
function qfix (line 154) | def qfix(q):
function euler2quat (line 174) | def euler2quat(e, order, deg=True):
function expmap_to_quaternion (line 219) | def expmap_to_quaternion(e):
function euler_to_quaternion (line 238) | def euler_to_quaternion(e, order):
function quaternion_to_matrix (line 279) | def quaternion_to_matrix(quaternions):
function quaternion_to_matrix_np (line 308) | def quaternion_to_matrix_np(quaternions):
function quaternion_to_cont6d_np (line 313) | def quaternion_to_cont6d_np(quaternions):
function quaternion_to_cont6d (line 319) | def quaternion_to_cont6d(quaternions):
function cont6d_to_matrix (line 325) | def cont6d_to_matrix(cont6d):
function cont6d_to_matrix_np (line 344) | def cont6d_to_matrix_np(cont6d):
function qpow (line 349) | def qpow(q0, t, dtype=torch.float):
function qslerp (line 376) | def qslerp(q0, q1, t):
function qbetween (line 393) | def qbetween(v0, v1):
function qbetween_np (line 417) | def qbetween_np(v0, v1):
function lerp (line 429) | def lerp(p0, p1, t):
FILE: eval/GVHMR/hmr4d/utils/geo/transforms.py
function axis_rotate_to_matrix (line 4) | def axis_rotate_to_matrix(angle, axis="x"):
FILE: eval/GVHMR/hmr4d/utils/geo_transform.py
function homo_points (line 15) | def homo_points(points):
function apply_Ts_on_seq_points (line 24) | def apply_Ts_on_seq_points(points, Ts):
function apply_T_on_points (line 36) | def apply_T_on_points(points, T):
function T_transforms_points (line 47) | def T_transforms_points(T, points, pattern):
function project_p2d (line 56) | def project_p2d(points, K=None, is_pinhole=True):
function gen_uv_from_HW (line 83) | def gen_uv_from_HW(H, W, device="cpu"):
function unproject_p2d (line 96) | def unproject_p2d(uv, z, K):
function cvt_p2d_from_i_to_c (line 108) | def cvt_p2d_from_i_to_c(uv, K):
function cvt_to_bi01_p2d (line 122) | def cvt_to_bi01_p2d(p2d, bbx_lurb):
function cvt_from_bi01_p2d (line 135) | def cvt_from_bi01_p2d(bi01_p2d, bbx_lurb):
function cvt_p2d_from_bi01_to_c (line 151) | def cvt_p2d_from_bi01_to_c(bi01, bbxs_lurb, Ks):
function cvt_p2d_from_pm1_to_i (line 165) | def cvt_p2d_from_pm1_to_i(p2d_pm1, bbx_xys):
function uv2l_index (line 176) | def uv2l_index(uv, W):
function l2uv_index (line 180) | def l2uv_index(l, W):
function transform_mat (line 186) | def transform_mat(R, t):
function axis_angle_to_matrix_exp_map (line 200) | def axis_angle_to_matrix_exp_map(aa):
function matrix_to_axis_angle_log_map (line 212) | def matrix_to_axis_angle_log_map(R):
function matrix_to_axis_angle (line 224) | def matrix_to_axis_angle(R):
function ransac_PnP (line 234) | def ransac_PnP(K, pts_2d, pts_3d, err_thr=10):
function ransac_PnP_batch (line 260) | def ransac_PnP_batch(K_raw, pts_2d, pts_3d, err_thr=10):
function triangulate_point (line 271) | def triangulate_point(Ts_w2c, c_p2d, **kwargs):
function triangulate_point_ortho (line 278) | def triangulate_point_ortho(Ts_w2c, c_p2d, **kwargs):
function get_nearby_points (line 285) | def get_nearby_points(points, query_verts, padding=0.0, p=1):
function unproj_bbx_to_fst (line 303) | def unproj_bbx_to_fst(bbx_lurb, K, near_z=0.5, far_z=12.5):
function convert_bbx_xys_to_lurb (line 314) | def convert_bbx_xys_to_lurb(bbx_xys):
function convert_lurb_to_bbx_xys (line 324) | def convert_lurb_to_bbx_xys(bbx_lurb):
function compute_T_ayf2az (line 336) | def compute_T_ayf2az(joints, inverse=False):
function compute_T_ayfz2ay (line 371) | def compute_T_ayfz2ay(joints, inverse=False):
function compute_T_ay2ayrot (line 407) | def compute_T_ay2ayrot(joints):
function compute_root_quaternion_ay (line 428) | def compute_root_quaternion_ay(joints):
function similarity_transform_batch (line 463) | def similarity_transform_batch(S1, S2):
function kabsch_algorithm_batch (line 520) | def kabsch_algorithm_batch(X1, X2):
function compute_cam_angvel (line 567) | def compute_cam_angvel(R_w2c, padding_last=True):
function ransac_gravity_vec (line 579) | def ransac_gravity_vec(xyz, num_iterations=100, threshold=0.05, verbose=...
function sequence_best_cammat (line 612) | def sequence_best_cammat(w_j3d, c_j3d, cam_rot):
function get_sequence_cammat (line 641) | def get_sequence_cammat(w_j3d, c_j3d, cam_rot):
function ransac_vec (line 656) | def ransac_vec(vel, min_multiply=20, verbose=False):
FILE: eval/GVHMR/hmr4d/utils/ik/ccd_ik.py
class CCD_IK (line 19) | class CCD_IK:
method __init__ (line 20) | def __init__(
method is_converged (line 64) | def is_converged(self):
method solve (line 72) | def solve(self):
method optimize (line 80) | def optimize(self, i):
method get_weight (line 147) | def get_weight(self, i):
FILE: eval/GVHMR/hmr4d/utils/kpts/kp2d_utils.py
function _taylor (line 10) | def _taylor(heatmap, coord):
function _get_max_preds (line 44) | def _get_max_preds(heatmaps):
function post_dark_udp (line 78) | def post_dark_udp(coords, batch_heatmaps, kernel=3):
function _gaussian_blur (line 140) | def _gaussian_blur(heatmaps, kernel=11):
function keypoints_from_heatmaps (line 181) | def keypoints_from_heatmaps(
function transform_preds (line 328) | def transform_preds(coords, center, scale, output_size, use_udp=False):
FILE: eval/GVHMR/hmr4d/utils/matrix.py
function identity_mat (line 12) | def identity_mat(x=None, device="cpu", is_numpy=False):
function vec2mat (line 35) | def vec2mat(vec):
function mat2vec (line 69) | def mat2vec(mat):
function vec2mat_batch (line 93) | def vec2mat_batch(vec):
function rotmat2tan_norm (line 128) | def rotmat2tan_norm(mat):
function mat2tan_norm (line 158) | def mat2tan_norm(mat):
function rotmat2tan_norm (line 171) | def rotmat2tan_norm(mat):
function tan_norm2rotmat (line 203) | def tan_norm2rotmat(tan_norm):
function rotmat332vec_batch (line 234) | def rotmat332vec_batch(mat):
function rotmat2vec_batch (line 257) | def rotmat2vec_batch(mat):
function mat2vec_batch (line 280) | def mat2vec_batch(mat):
function mat2pose_batch (line 304) | def mat2pose_batch(mat, returnvel=True):
function get_mat_BinA (line 335) | def get_mat_BinA(matCtoA, matCtoB):
function get_mat_BtoA (line 359) | def get_mat_BtoA(matA, matB):
function get_mat_BfromA (line 382) | def get_mat_BfromA(matA, matBtoA):
function get_relative_position_to (line 398) | def get_relative_position_to(pos, mat):
function get_rotation (line 423) | def get_rotation(mat):
function set_rotation (line 435) | def set_rotation(mat, rotmat):
function set_position (line 448) | def set_position(mat, pos):
function get_position (line 461) | def get_position(mat):
function get_position_from (line 473) | def get_position_from(pos, mat):
function get_position_from_rotmat (line 494) | def get_position_from_rotmat(pos, mat):
function get_relative_direction_to (line 513) | def get_relative_direction_to(dir, mat):
function get_direction_from (line 542) | def get_direction_from(dir, mat):
function get_coord_vis (line 564) | def get_coord_vis(pos, rot_mat, scale=1.0):
function project_vec (line 571) | def project_vec(vec):
function xz2xyz (line 594) | def xz2xyz(vec):
function normalized (line 609) | def normalized(vec):
function normalized_matrix (line 620) | def normalized_matrix(mat):
function get_rot_mat_from_forward (line 642) | def get_rot_mat_from_forward(forward):
function get_rot_mat_from_forward_up (line 674) | def get_rot_mat_from_forward_up(forward, up):
function get_rot_mat_from_pose_vec (line 703) | def get_rot_mat_from_pose_vec(vec):
function get_TRS (line 717) | def get_TRS(rot_mat, pos):
function xzvec2mat (line 742) | def xzvec2mat(vec):
function distance (line 770) | def distance(vec1, vec2):
function get_relative_pose_from_vec (line 774) | def get_relative_pose_from_vec(pose, root, N):
function get_forward_from_pos (line 787) | def get_forward_from_pos(pos):
function project_point_along_ray (line 808) | def project_point_along_ray(p, ray, keepnorm=False):
function solve_point_along_ray_with_constraint (line 829) | def solve_point_along_ray_with_constraint(c, ray, p, constraint="x"):
function calc_cosine (line 855) | def calc_cosine(vec1, vec2, return_angle=False):
function quat_xyzw2wxyz (line 881) | def quat_xyzw2wxyz(quat):
function quat_wxyz2xyzw (line 886) | def quat_wxyz2xyzw(quat):
function quat_mul (line 891) | def quat_mul(a, b):
function quat_pos (line 906) | def quat_pos(x):
function quat_abs (line 916) | def quat_abs(x):
function quat_unit (line 924) | def quat_unit(x):
function quat_conjugate (line 932) | def quat_conjugate(x):
function quat_real (line 939) | def quat_real(x):
function quat_imaginary (line 946) | def quat_imaginary(x):
function quat_norm_check (line 953) | def quat_norm_check(x):
function quat_normalize (line 963) | def quat_normalize(q):
function quat_from_xyz (line 971) | def quat_from_xyz(xyz):
function quat_identity (line 980) | def quat_identity(shape: List[int]):
function tgm_quat_from_angle_axis (line 990) | def tgm_quat_from_angle_axis(angle, axis, degree: bool = False):
function quat_from_rotation_matrix (line 1013) | def quat_from_rotation_matrix(m):
function quat_mul_norm (line 1062) | def quat_mul_norm(x, y):
function quat_rotate (line 1070) | def quat_rotate(rot, vec):
function quat_inverse (line 1078) | def quat_inverse(x):
function quat_identity_like (line 1085) | def quat_identity_like(x):
function quat_angle_axis (line 1092) | def quat_angle_axis(x):
function quat_yaw_rotation (line 1104) | def quat_yaw_rotation(x, z_up: bool = True):
function transform_from_rotation_translation (line 1124) | def transform_from_rotation_translation(r: Optional[torch.Tensor] = None...
function transform_identity (line 1137) | def transform_identity(shape: List[int]):
function transform_rotation (line 1146) | def transform_rotation(x):
function transform_translation (line 1151) | def transform_translation(x):
function transform_inverse (line 1156) | def transform_inverse(x):
function transform_identity_like (line 1164) | def transform_identity_like(x):
function transform_mul (line 1171) | def transform_mul(x, y):
function transform_apply (line 1182) | def transform_apply(rot, vec):
function rot_matrix_det (line 1190) | def rot_matrix_det(x):
function rot_matrix_integrity_check (line 1204) | def rot_matrix_integrity_check(x):
function rot_matrix_from_quaternion (line 1218) | def rot_matrix_from_quaternion(q):
function euclidean_to_rotation_matrix (line 1245) | def euclidean_to_rotation_matrix(x):
function euclidean_integrity_check (line 1252) | def euclidean_integrity_check(x):
function euclidean_translation (line 1258) | def euclidean_translation(x):
function euclidean_inverse (line 1265) | def euclidean_inverse(x):
function euclidean_to_transform (line 1276) | def euclidean_to_transform(transformation_matrix):
function to_torch (line 1286) | def to_torch(x, dtype=torch.float, device="cuda:0", requires_grad=False):
function quat_mul (line 1290) | def quat_mul(a, b):
function normalize (line 1313) | def normalize(x, eps: float = 1e-9):
function quat_apply (line 1317) | def quat_apply(a, b):
function quat_rotate (line 1326) | def quat_rotate(q, v):
function quat_rotate_inverse (line 1336) | def quat_rotate_inverse(q, v):
function quat_conjugate (line 1346) | def quat_conjugate(a):
function quat_unit (line 1352) | def quat_unit(a):
function quat_from_angle_axis (line 1356) | def quat_from_angle_axis(angle, axis):
function normalize_angle (line 1363) | def normalize_angle(x):
function tf_inverse (line 1367) | def tf_inverse(q, t):
function tf_apply (line 1372) | def tf_apply(q, t, v):
function tf_vector (line 1376) | def tf_vector(q, v):
function tf_combine (line 1380) | def tf_combine(q1, t1, q2, t2):
function get_basis_vector (line 1384) | def get_basis_vector(q, v):
function get_axis_params (line 1388) | def get_axis_params(value, axis_idx, x_value=0.0, dtype=float, n_dims=3):
function copysign (line 1398) | def copysign(a, b):
function get_euler_xyz (line 1404) | def get_euler_xyz(q):
function quat_from_euler_xyz (line 1423) | def quat_from_euler_xyz(roll, pitch, yaw):
function torch_rand_float (line 1439) | def torch_rand_float(lower, upper, shape, device):
function torch_random_dir_2 (line 1444) | def torch_random_dir_2(shape, device):
function tensor_clamp (line 1450) | def tensor_clamp(t, min_t, max_t):
function scale (line 1454) | def scale(x, lower, upper):
function unscale (line 1458) | def unscale(x, lower, upper):
function unscale_np (line 1462) | def unscale_np(x, lower, upper):
function quat_to_angle_axis (line 1466) | def quat_to_angle_axis(q):
function angle_axis_to_exp_map (line 1489) | def angle_axis_to_exp_map(angle, axis):
function quat_to_exp_map (line 1497) | def quat_to_exp_map(q):
function quat_to_tan_norm (line 1506) | def quat_to_tan_norm(q):
function euler_xyz_to_exp_map (line 1521) | def euler_xyz_to_exp_map(roll, pitch, yaw):
function exp_map_to_angle_axis (line 1528) | def exp_map_to_angle_axis(exp_map):
function exp_map_to_quat (line 1547) | def exp_map_to_quat(exp_map):
function slerp (line 1553) | def slerp(q0, q1, t):
function calc_heading_vec (line 1577) | def calc_heading_vec(q, head_ind=0):
function calc_heading (line 1589) | def calc_heading(q, head_ind=0, gravity_axis="z"):
function calc_heading_quat (line 1611) | def calc_heading_quat(q, head_ind=0, gravity_axis="z"):
function calc_heading_quat_inv (line 1630) | def calc_heading_quat_inv(q, head_ind=0):
function forward_kinematics (line 1643) | def forward_kinematics(mat, parent):
FILE: eval/GVHMR/hmr4d/utils/net_utils.py
function load_pretrained_model (line 11) | def load_pretrained_model(model, ckpt_path):
function find_last_ckpt_path (line 25) | def find_last_ckpt_path(dirpath):
function get_resume_ckpt_path (line 50) | def get_resume_ckpt_path(resume_mode, ckpt_dir=None):
function select_state_dict_by_prefix (line 57) | def select_state_dict_by_prefix(state_dict, prefix, new_prefix=""):
function detach_to_cpu (line 75) | def detach_to_cpu(in_dict):
function to_cuda (line 79) | def to_cuda(data):
function get_valid_mask (line 91) | def get_valid_mask(max_len, valid_len, device="cpu"):
function length_to_mask (line 97) | def length_to_mask(lengths, max_len):
function repeat_to_max_len (line 105) | def repeat_to_max_len(x, max_len, dim=0):
function repeat_to_max_len_dict (line 120) | def repeat_to_max_len_dict(x_dict, max_len, dim=0):
class Transpose (line 126) | class Transpose(nn.Module):
method __init__ (line 127) | def __init__(self, dim1, dim2):
method forward (line 132) | def forward(self, x):
class GaussianSmooth (line 136) | class GaussianSmooth(nn.Module):
method __init__ (line 137) | def __init__(self, sigma=3, dim=-1):
method forward (line 144) | def forward(self, x):
function gaussian_smooth (line 158) | def gaussian_smooth(x, sigma=3, dim=-1):
function moving_average_smooth (line 173) | def moving_average_smooth(x, window_size=5, dim=-1):
FILE: eval/GVHMR/hmr4d/utils/preproc/slam.py
class SLAMModel (line 18) | class SLAMModel(object):
method __init__ (line 19) | def __init__(self, video_path, width, height, intrinsics=None, stride=...
method track (line 41) | def track(self):
method process (line 62) | def process(self):
function video_stream (line 70) | def video_stream(queue, imagedir, intrinsics, stride, skip=0, resize=0.5):
FILE: eval/GVHMR/hmr4d/utils/preproc/tracker.py
class Tracker (line 19) | class Tracker:
method __init__ (line 20) | def __init__(self) -> None:
method track (line 24) | def track(self, video_path):
method sort_track_length (line 48) | def sort_track_length(track_history, video_path):
method get_one_track (line 75) | def get_one_track(self, video_path):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitfeat_extractor.py
function get_batch (line 13) | def get_batch(input_path, bbx_xys, img_ds=0.5, img_dst_size=256, path_ty...
class Extractor (line 60) | class Extractor:
method __init__ (line 61) | def __init__(self, tqdm_leave=True):
method extract_video_features (line 65) | def extract_video_features(self, video_path, bbx_xys, img_ds=0.5):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose.py
class VitPoseExtractor (line 13) | class VitPoseExtractor:
method __init__ (line 14) | def __init__(self, tqdm_leave=True):
method extract (line 23) | def extract(self, video_path, bbx_xys, img_ds=0.5):
function get_heatmap_preds (line 75) | def get_heatmap_preds(heatmap, normalize_keypoints=True, thr=0.0, soft=F...
function soft_patch_dx_dy (line 123) | def soft_patch_dx_dy(p):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/alexnet.py
class AlexNet (line 9) | class AlexNet(BaseBackbone):
method __init__ (line 20) | def __init__(self, num_classes=-1):
method forward (line 49) | def forward(self, x):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/cpm.py
class CpmBlock (line 15) | class CpmBlock(nn.Module):
method __init__ (line 24) | def __init__(self,
method forward (line 47) | def forward(self, x):
class CPM (line 54) | class CPM(BaseBackbone):
method __init__ (line 86) | def __init__(self,
method init_weights (line 149) | def init_weights(self, pretrained=None):
method forward (line 168) | def forward(self, x):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/hourglass.py
class HourglassModule (line 15) | class HourglassModule(nn.Module):
method __init__ (line 29) | def __init__(self,
method forward (line 78) | def forward(self, x):
class HourglassNet (line 89) | class HourglassNet(BaseBackbone):
method __init__ (line 120) | def __init__(self,
method init_weights (line 174) | def init_weights(self, pretrained=None):
method forward (line 193) | def forward(self, x):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/hourglass_ae.py
class HourglassAEModule (line 14) | class HourglassAEModule(nn.Module):
method __init__ (line 26) | def __init__(self,
method forward (line 58) | def forward(self, x):
class HourglassAENet (line 70) | class HourglassAENet(BaseBackbone):
method __init__ (line 102) | def __init__(self,
method init_weights (line 175) | def init_weights(self, pretrained=None):
method forward (line 194) | def forward(self, x):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/hrformer.py
function nlc_to_nchw (line 18) | def nlc_to_nchw(x, hw_shape):
function nchw_to_nlc (line 35) | def nchw_to_nlc(x):
function build_drop_path (line 48) | def build_drop_path(drop_path_rate):
class WindowMSA (line 53) | class WindowMSA(BaseModule):
method __init__ (line 74) | def __init__(self,
method init_weights (line 113) | def init_weights(self):
method forward (line 116) | def forward(self, x, mask=None):
method double_step_seq (line 157) | def double_step_seq(step1, len1, step2, len2):
class LocalWindowSelfAttention (line 163) | class LocalWindowSelfAttention(BaseModule):
method __init__ (line 188) | def __init__(self,
method forward (line 215) | def forward(self, x, H, W, **kwargs):
class CrossFFN (line 260) | class CrossFFN(BaseModule):
method __init__ (line 277) | def __init__(self,
method forward (line 310) | def forward(self, x, H, W):
class HRFormerBlock (line 319) | class HRFormerBlock(BaseModule):
method __init__ (line 342) | def __init__(self,
method forward (line 380) | def forward(self, x):
method extra_repr (line 391) | def extra_repr(self):
class HRFomerModule (line 397) | class HRFomerModule(HRModule):
method __init__ (line 430) | def __init__(self,
method _make_one_branch (line 461) | def _make_one_branch(self,
method _make_fuse_layers (line 504) | def _make_fuse_layers(self):
method get_num_inchannels (line 574) | def get_num_inchannels(self):
class HRFormer (line 580) | class HRFormer(HRNet):
method __init__ (line 663) | def __init__(self,
method _make_stage (line 701) | def _make_stage(self,
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/litehrnet.py
class SpatialWeighting (line 21) | class SpatialWeighting(nn.Module):
method __init__ (line 36) | def __init__(self,
method forward (line 65) | def forward(self, x):
class CrossResolutionWeighting (line 72) | class CrossResolutionWeighting(nn.Module):
method __init__ (line 87) | def __init__(self,
method forward (line 117) | def forward(self, x):
class ConditionalChannelWeighting (line 131) | class ConditionalChannelWeighting(nn.Module):
method __init__ (line 146) | def __init__(self,
method forward (line 184) | def forward(self, x):
class Stem (line 208) | class Stem(nn.Module):
method __init__ (line 225) | def __init__(self,
method forward (line 309) | def forward(self, x):
class IterativeHead (line 333) | class IterativeHead(nn.Module):
method __init__ (line 342) | def __init__(self, in_channels, norm_cfg=dict(type='BN')):
method forward (line 375) | def forward(self, x):
class ShuffleUnit (line 395) | class ShuffleUnit(nn.Module):
method __init__ (line 412) | def __init__(self,
method forward (line 489) | def forward(self, x):
class LiteHRModule (line 510) | class LiteHRModule(nn.Module):
method __init__ (line 531) | def __init__(
method _check_branches (line 567) | def _check_branches(self, num_branches, in_channels):
method _make_weighting_blocks (line 574) | def _make_weighting_blocks(self, num_blocks, reduce_ratio, stride=1):
method _make_one_branch (line 589) | def _make_one_branch(self, branch_index, num_blocks, stride=1):
method _make_naive_branches (line 614) | def _make_naive_branches(self, num_branches, num_blocks):
method _make_fuse_layers (line 623) | def _make_fuse_layers(self):
method forward (line 708) | def forward(self, x):
class LiteHRNet (line 738) | class LiteHRNet(nn.Module):
method __init__ (line 785) | def __init__(self,
method _make_transition_layer (line 831) | def _make_transition_layer(self, num_channels_pre_layer,
method _make_stage (line 899) | def _make_stage(self,
method init_weights (line 935) | def init_weights(self, pretrained=None):
method forward (line 954) | def forward(self, x):
method train (line 978) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/mobilenet_v2.py
class InvertedResidual (line 15) | class InvertedResidual(nn.Module):
method __init__ (line 34) | def __init__(self,
method forward (line 85) | def forward(self, x):
class MobileNetV2 (line 101) | class MobileNetV2(BaseBackbone):
method __init__ (line 130) | def __init__(self,
method make_layer (line 204) | def make_layer(self, out_channels, num_blocks, stride, expand_ratio):
method init_weights (line 232) | def init_weights(self, pretrained=None):
method forward (line 245) | def forward(self, x):
method _freeze_stages (line 259) | def _freeze_stages(self):
method train (line 269) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/mobilenet_v3.py
class MobileNetV3 (line 15) | class MobileNetV3(BaseBackbone):
method __init__ (line 67) | def __init__(self,
method _make_layer (line 112) | def _make_layer(self):
method init_weights (line 144) | def init_weights(self, pretrained=None):
method forward (line 157) | def forward(self, x):
method _freeze_stages (line 172) | def _freeze_stages(self):
method train (line 182) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/mspn.py
class Bottleneck (line 18) | class Bottleneck(_Bottleneck):
method __init__ (line 32) | def __init__(self, in_channels, out_channels, **kwargs):
class DownsampleModule (line 36) | class DownsampleModule(nn.Module):
method __init__ (line 51) | def __init__(self,
method _make_layer (line 75) | def _make_layer(self, block, out_channels, blocks, stride=1):
method forward (line 102) | def forward(self, x, skip1, skip2):
class UpsampleUnit (line 116) | class UpsampleUnit(nn.Module):
method __init__ (line 138) | def __init__(self,
method forward (line 206) | def forward(self, x, up_x):
class UpsampleModule (line 232) | class UpsampleModule(nn.Module):
method __init__ (line 249) | def __init__(self,
method forward (line 282) | def forward(self, x):
class SingleStageNetwork (line 304) | class SingleStageNetwork(nn.Module):
method __init__ (line 324) | def __init__(self,
method forward (line 351) | def forward(self, x, skip1, skip2):
class ResNetTop (line 358) | class ResNetTop(nn.Module):
method __init__ (line 367) | def __init__(self, norm_cfg=dict(type='BN'), channels=64):
method forward (line 381) | def forward(self, img):
class MSPN (line 386) | class MSPN(BaseBackbone):
method __init__ (line 421) | def __init__(self,
method forward (line 459) | def forward(self, x):
method init_weights (line 471) | def init_weights(self, pretrained=None):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/regnet.py
class RegNet (line 14) | class RegNet(ResNet):
method __init__ (line 88) | def __init__(self,
method _make_stem_layer (line 197) | def _make_stem_layer(self, in_channels, base_channels):
method generate_regnet (line 212) | def generate_regnet(initial_width,
method quantize_float (line 244) | def quantize_float(number, divisor):
method adjust_width_group (line 256) | def adjust_width_group(self, widths, bottleneck_ratio, groups):
method get_stages_from_blocks (line 281) | def get_stages_from_blocks(self, widths):
method forward (line 303) | def forward(self, x):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/resnest.py
class RSoftmax (line 13) | class RSoftmax(nn.Module):
method __init__ (line 21) | def __init__(self, radix, groups):
method forward (line 26) | def forward(self, x):
class SplitAttentionConv2d (line 37) | class SplitAttentionConv2d(nn.Module):
method __init__ (line 56) | def __init__(self,
method norm0 (line 97) | def norm0(self):
method norm1 (line 101) | def norm1(self):
method forward (line 104) | def forward(self, x):
class Bottleneck (line 132) | class Bottleneck(_Bottleneck):
method __init__ (line 162) | def __init__(self,
method forward (line 225) | def forward(self, x):
class ResNeSt (line 260) | class ResNeSt(ResNetV1d):
method __init__ (line 315) | def __init__(self,
method make_res_layer (line 330) | def make_res_layer(self, **kwargs):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/resnext.py
class Bottleneck (line 9) | class Bottleneck(_Bottleneck):
method __init__ (line 34) | def __init__(self,
class ResNeXt (line 90) | class ResNeXt(ResNet):
method __init__ (line 152) | def __init__(self, depth, groups=32, width_per_group=4, **kwargs):
method make_res_layer (line 157) | def make_res_layer(self, **kwargs):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/rsn.py
class RSB (line 14) | class RSB(nn.Module):
method __init__ (line 35) | def __init__(self,
method forward (line 90) | def forward(self, x):
class Downsample_module (line 125) | class Downsample_module(nn.Module):
method __init__ (line 143) | def __init__(self,
method _make_layer (line 180) | def _make_layer(self,
method forward (line 222) | def forward(self, x, skip1, skip2):
class Upsample_unit (line 236) | class Upsample_unit(nn.Module):
method __init__ (line 258) | def __init__(self,
method forward (line 326) | def forward(self, x, up_x):
class Upsample_module (line 352) | class Upsample_module(nn.Module):
method __init__ (line 369) | def __init__(self,
method forward (line 401) | def forward(self, x):
class Single_stage_RSN (line 423) | class Single_stage_RSN(nn.Module):
method __init__ (line 446) | def __init__(self,
method forward (line 477) | def forward(self, x, skip1, skip2):
class ResNet_top (line 484) | class ResNet_top(nn.Module):
method __init__ (line 493) | def __init__(self, norm_cfg=dict(type='BN'), channels=64):
method forward (line 507) | def forward(self, img):
class RSN (line 512) | class RSN(BaseBackbone):
method __init__ (line 549) | def __init__(self,
method forward (line 592) | def forward(self, x):
method init_weights (line 604) | def init_weights(self, pretrained=None):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/scnet.py
class SCConv (line 14) | class SCConv(nn.Module):
method __init__ (line 28) | def __init__(self,
method forward (line 77) | def forward(self, x):
class SCBottleneck (line 90) | class SCBottleneck(Bottleneck):
method __init__ (line 100) | def __init__(self, in_channels, out_channels, **kwargs):
method forward (line 153) | def forward(self, x):
class SCNet (line 192) | class SCNet(ResNet):
method __init__ (line 245) | def __init__(self, depth, **kwargs):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/seresnet.py
class SEBottleneck (line 9) | class SEBottleneck(Bottleneck):
method __init__ (line 18) | def __init__(self, in_channels, out_channels, se_ratio=16, **kwargs):
method forward (line 22) | def forward(self, x):
class SEResNet (line 58) | class SEResNet(ResNet):
method __init__ (line 118) | def __init__(self, depth, se_ratio=16, **kwargs):
method make_res_layer (line 124) | def make_res_layer(self, **kwargs):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/seresnext.py
class SEBottleneck (line 10) | class SEBottleneck(_SEBottleneck):
method __init__ (line 37) | def __init__(self,
class SEResNeXt (line 95) | class SEResNeXt(SEResNet):
method __init__ (line 158) | def __init__(self, depth, groups=32, width_per_group=4, **kwargs):
method make_res_layer (line 163) | def make_res_layer(self, **kwargs):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/shufflenet_v1.py
class ShuffleUnit (line 17) | class ShuffleUnit(nn.Module):
method __init__ (line 47) | def __init__(self,
method _add (line 117) | def _add(x, out):
method _concat (line 122) | def _concat(x, out):
method forward (line 126) | def forward(self, x):
class ShuffleNetV1 (line 157) | class ShuffleNetV1(BaseBackbone):
method __init__ (line 182) | def __init__(self,
method _freeze_stages (line 250) | def _freeze_stages(self):
method init_weights (line 260) | def init_weights(self, pretrained=None):
method make_layer (line 280) | def make_layer(self, out_channels, num_blocks, first_block=False):
method forward (line 309) | def forward(self, x):
method train (line 323) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/shufflenet_v2.py
class InvertedResidual (line 16) | class InvertedResidual(nn.Module):
method __init__ (line 33) | def __init__(self,
method forward (line 113) | def forward(self, x):
class ShuffleNetV2 (line 135) | class ShuffleNetV2(BaseBackbone):
method __init__ (line 158) | def __init__(self,
method _make_layer (line 228) | def _make_layer(self, out_channels, num_blocks):
method _freeze_stages (line 251) | def _freeze_stages(self):
method init_weights (line 262) | def init_weights(self, pretrained=None):
method forward (line 282) | def forward(self, x):
method train (line 296) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/tcn.py
class BasicTemporalBlock (line 13) | class BasicTemporalBlock(nn.Module):
method __init__ (line 38) | def __init__(self,
method forward (line 100) | def forward(self, x):
class TCN (line 133) | class TCN(BaseBackbone):
method __init__ (line 179) | def __init__(self,
method forward (line 245) | def forward(self, x):
method init_weights (line 259) | def init_weights(self, pretrained=None):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/test_torch.py
class Net (line 6) | class Net(nn.Module):
method __init__ (line 8) | def __init__(self):
method forward (line 19) | def forward(self, x):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/channel_shuffle.py
function channel_shuffle (line 5) | def channel_shuffle(x, groups):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/inverted_residual.py
class InvertedResidual (line 11) | class InvertedResidual(nn.Module):
method __init__ (line 41) | def __init__(self,
method forward (line 104) | def forward(self, x):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/make_divisible.py
function make_divisible (line 2) | def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/se_layer.py
class SELayer (line 7) | class SELayer(nn.Module):
method __init__ (line 24) | def __init__(self,
method forward (line 50) | def forward(self, x):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/utils.py
function load_checkpoint (line 7) | def load_checkpoint(model,
function get_state_dict (line 53) | def get_state_dict(filename, map_location='cpu'):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/vgg.py
function make_vgg_layer (line 10) | def make_vgg_layer(in_channels,
class VGG (line 39) | class VGG(BaseBackbone):
method __init__ (line 77) | def __init__(self,
method init_weights (line 148) | def init_weights(self, pretrained=None):
method forward (line 159) | def forward(self, x):
method _freeze_stages (line 177) | def _freeze_stages(self):
method train (line 186) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/vipnas_mbv3.py
class ViPNAS_MobileNetV3 (line 15) | class ViPNAS_MobileNetV3(BaseBackbone):
method __init__ (line 45) | def __init__(self,
method _make_layer (line 91) | def _make_layer(self):
method init_weights (line 137) | def init_weights(self, pretrained=None):
method forward (line 154) | def forward(self, x):
method _freeze_stages (line 163) | def _freeze_stages(self):
method train (line 173) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/vipnas_resnet.py
class ViPNAS_Bottleneck (line 14) | class ViPNAS_Bottleneck(nn.Module):
method __init__ (line 41) | def __init__(self,
method norm1 (line 124) | def norm1(self):
method norm2 (line 129) | def norm2(self):
method norm3 (line 134) | def norm3(self):
method forward (line 138) | def forward(self, x):
function get_expansion (line 175) | def get_expansion(block, expansion=None):
class ViPNAS_ResLayer (line 208) | class ViPNAS_ResLayer(nn.Sequential):
method __init__ (line 238) | def __init__(self,
class ViPNAS_ResNet (line 343) | class ViPNAS_ResNet(BaseBackbone):
method __init__ (line 392) | def __init__(self,
method make_res_layer (line 474) | def make_res_layer(self, **kwargs):
method norm1 (line 479) | def norm1(self):
method _make_stem_layer (line 483) | def _make_stem_layer(self, in_channels, stem_channels, kernel_size):
method _freeze_stages (line 529) | def _freeze_stages(self):
method init_weights (line 548) | def init_weights(self, pretrained=None):
method forward (line 562) | def forward(self, x):
method train (line 581) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/vit.py
class DropPath (line 15) | class DropPath(nn.Module):
method __init__ (line 18) | def __init__(self, drop_prob=None):
method forward (line 22) | def forward(self, x):
method extra_repr (line 25) | def extra_repr(self):
class Mlp (line 28) | class Mlp(nn.Module):
method __init__ (line 29) | def __init__(self, in_features, hidden_features=None, out_features=Non...
method forward (line 38) | def forward(self, x):
class Attention (line 45) | class Attention(nn.Module):
method __init__ (line 46) | def __init__(
method forward (line 66) | def forward(self, x):
class Block (line 84) | class Block(nn.Module):
method __init__ (line 86) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
method forward (line 104) | def forward(self, x):
class PatchEmbed (line 110) | class PatchEmbed(nn.Module):
method __init__ (line 113) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
method forward (line 126) | def forward(self, x, **kwargs):
class HybridEmbed (line 135) | class HybridEmbed(nn.Module):
method __init__ (line 139) | def __init__(self, backbone, img_size=224, feature_size=None, in_chans...
method forward (line 160) | def forward(self, x):
class ViT (line 168) | class ViT(nn.Module):
method __init__ (line 170) | def __init__(self,
method _freeze_stages (line 216) | def _freeze_stages(self):
method init_weights (line 253) | def init_weights(self, pretrained=None):
method get_num_layers (line 273) | def get_num_layers(self):
method no_weight_decay (line 277) | def no_weight_decay(self):
method forward_features (line 280) | def forward_features(self, x):
method forward (line 301) | def forward(self, x):
method train (line 305) | def train(self, mode=True):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/configs/coco/ViTPose_base_coco_256x192.py
function make_cfg (line 165) | def make_cfg(model=model,data_cfg=data_cfg):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/deconv_head.py
class DeconvHead (line 12) | class DeconvHead(nn.Module):
method __init__ (line 40) | def __init__(self,
method _init_inputs (line 129) | def _init_inputs(self, in_channels, in_index, input_transform):
method _transform_inputs (line 169) | def _transform_inputs(self, inputs):
method _make_deconv_layer (line 198) | def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
method _get_deconv_cfg (line 232) | def _get_deconv_cfg(deconv_kernel):
method get_loss (line 248) | def get_loss(self, outputs, targets, masks):
method forward (line 275) | def forward(self, x):
method init_weights (line 284) | def init_weights(self):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/deeppose_regression_head.py
class DeepposeRegressionHead (line 13) | class DeepposeRegressionHead(nn.Module):
method __init__ (line 24) | def __init__(self,
method forward (line 42) | def forward(self, x):
method get_loss (line 48) | def get_loss(self, output, target, target_weight):
method get_accuracy (line 69) | def get_accuracy(self, output, target, target_weight):
method inference_model (line 97) | def inference_model(self, x, flip_pairs=None):
method decode (line 117) | def decode(self, img_metas, output, **kwargs):
method init_weights (line 175) | def init_weights(self):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/hmr_head.py
class HMRMeshHead (line 12) | class HMRMeshHead(nn.Module):
method __init__ (line 22) | def __init__(self, in_channels, smpl_mean_params=None, n_iter=3):
method forward (line 58) | def forward(self, x):
method init_weights (line 90) | def init_weights(self):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/interhand_3d_head.py
class Heatmap3DHead (line 17) | class Heatmap3DHead(nn.Module):
method __init__ (line 33) | def __init__(self,
method _make_deconv_layer (line 117) | def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
method _get_deconv_cfg (line 151) | def _get_deconv_cfg(deconv_kernel):
method forward (line 167) | def forward(self, x):
method init_weights (line 176) | def init_weights(self):
class Heatmap1DHead (line 190) | class Heatmap1DHead(nn.Module):
method __init__ (line 200) | def __init__(self, in_channels=2048, heatmap_size=64, hidden_dims=(512...
method soft_argmax_1d (line 209) | def soft_argmax_1d(self, heatmap1d):
method _make_linear_layers (line 217) | def _make_linear_layers(self, feat_dims, relu_final=False):
method forward (line 227) | def forward(self, x):
method init_weights (line 233) | def init_weights(self):
class MultilabelClassificationHead (line 240) | class MultilabelClassificationHead(nn.Module):
method __init__ (line 250) | def __init__(self, in_channels=2048, num_labels=2, hidden_dims=(512, )):
method _make_linear_layers (line 259) | def _make_linear_layers(self, feat_dims, relu_final=False):
method forward (line 269) | def forward(self, x):
method init_weights (line 274) | def init_weights(self):
class Interhand3DHead (line 281) | class Interhand3DHead(nn.Module):
method __init__ (line 300) | def __init__(self,
method init_weights (line 327) | def init_weights(self):
method get_loss (line 333) | def get_loss(self, output, target, target_weight):
method get_accuracy (line 365) | def get_accuracy(self, output, target, target_weight):
method forward (line 383) | def forward(self, x):
method inference_model (line 394) | def inference_model(self, x, flip_pairs=None):
method decode (line 443) | def decode(self, img_metas, output, **kwargs):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/temporal_regression_head.py
class TemporalRegressionHead (line 13) | class TemporalRegressionHead(nn.Module):
method __init__ (line 31) | def __init__(self,
method _transform_inputs (line 63) | def _transform_inputs(x):
method forward (line 80) | def forward(self, x):
method get_loss (line 89) | def get_loss(self, output, target, target_weight):
method get_accuracy (line 128) | def get_accuracy(self, output, target, target_weight, metas):
method inference_model (line 202) | def inference_model(self, x, flip_pairs=None):
method decode (line 225) | def decode(self, metas, output):
method _denormalize_joints (line 265) | def _denormalize_joints(x, mean, std):
method _restore_global_position (line 279) | def _restore_global_position(x, root_pos, root_idx=None):
method _restore_root_target_weight (line 295) | def _restore_root_target_weight(target_weight, root_weight, root_idx=N...
method init_weights (line 313) | def init_weights(self):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/topdown_heatmap_base_head.py
class TopdownHeatmapBaseHead (line 10) | class TopdownHeatmapBaseHead(nn.Module):
method get_loss (line 25) | def get_loss(self, **kwargs):
method get_accuracy (line 29) | def get_accuracy(self, **kwargs):
method forward (line 33) | def forward(self, **kwargs):
method inference_model (line 37) | def inference_model(self, **kwargs):
method decode (line 40) | def decode(self, img_metas, output, **kwargs):
method _get_deconv_cfg (line 106) | def _get_deconv_cfg(deconv_kernel):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/topdown_heatmap_multi_stage_head.py
class TopdownHeatmapMultiStageHead (line 18) | class TopdownHeatmapMultiStageHead(TopdownHeatmapBaseHead):
method __init__ (line 38) | def __init__(self,
method get_loss (line 109) | def get_loss(self, output, target, target_weight):
method get_accuracy (line 150) | def get_accuracy(self, output, target, target_weight):
method forward (line 177) | def forward(self, x):
method inference_model (line 191) | def inference_model(self, x, flip_pairs=None):
method _make_deconv_layer (line 220) | def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
method init_weights (line 253) | def init_weights(self):
class PredictHeatmap (line 265) | class PredictHeatmap(nn.Module):
method __init__ (line 277) | def __init__(self,
method forward (line 311) | def forward(self, feature):
class PRM (line 320) | class PRM(nn.Module):
method __init__ (line 333) | def __init__(self, out_channels, norm_cfg=dict(type='BN')):
method forward (line 374) | def forward(self, x):
class TopdownHeatmapMSMUHead (line 391) | class TopdownHeatmapMSMUHead(TopdownHeatmapBaseHead):
method __init__ (line 408) | def __init__(self,
method get_loss (line 446) | def get_loss(self, output, target, target_weight):
method get_accuracy (line 488) | def get_accuracy(self, output, target, target_weight):
method forward (line 518) | def forward(self, x):
method inference_model (line 538) | def inference_model(self, x, flip_pairs=None):
method init_weights (line 564) | def init_weights(self):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/topdown_heatmap_simple_head.py
function build_conv_layer (line 15) | def build_conv_layer(cfg, *args, **kwargs) -> nn.Module:
function build_upsample_layer (line 37) | def build_upsample_layer(cfg, *args, **kwargs) -> nn.Module:
class TopdownHeatmapSimpleHead (line 58) | class TopdownHeatmapSimpleHead(TopdownHeatmapBaseHead):
method __init__ (line 90) | def __init__(self,
method get_loss (line 188) | def get_loss(self, output, target, target_weight):
method get_accuracy (line 212) | def get_accuracy(self, output, target, target_weight):
method forward (line 239) | def forward(self, x):
method inference_model (line 246) | def inference_model(self, x, flip_pairs=None):
method _init_inputs (line 271) | def _init_inputs(self, in_channels, in_index, input_transform):
method _transform_inputs (line 311) | def _transform_inputs(self, inputs):
method _make_deconv_layer (line 348) | def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
method init_weights (line 381) | def init_weights(self):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/vipnas_heatmap_simple_head.py
class ViPNASHeatmapSimpleHead (line 16) | class ViPNASHeatmapSimpleHead(TopdownHeatmapBaseHead):
method __init__ (line 52) | def __init__(self,
method get_loss (line 146) | def get_loss(self, output, target, target_weight):
method get_accuracy (line 170) | def get_accuracy(self, output, target, target_weight):
method forward (line 197) | def forward(self, x):
method inference_model (line 204) | def inference_model(self, x, flip_pairs=None):
method _init_inputs (line 229) | def _init_inputs(self, in_channels, in_index, input_transform):
method _transform_inputs (line 269) | def _transform_inputs(self, inputs):
method _make_deconv_layer (line 298) | def _make_deconv_layer(self, num_layers, num_filters, num_kernels,
method init_weights (line 338) | def init_weights(self):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/voxelpose_head.py
class CuboidCenterHead (line 15) | class CuboidCenterHead(nn.Module):
method __init__ (line 28) | def __init__(self,
method _get_real_locations (line 44) | def _get_real_locations(self, indices):
method _nms_by_max_pool (line 58) | def _nms_by_max_pool(self, heatmap_volumes):
method _max_pool (line 69) | def _max_pool(self, inputs):
method _get_3d_indices (line 78) | def _get_3d_indices(indices, shape):
method forward (line 98) | def forward(self, heatmap_volumes):
method get_loss (line 122) | def get_loss(self, pred_cubes, gt):
class CuboidPoseHead (line 128) | class CuboidPoseHead(nn.Module):
method __init__ (line 130) | def __init__(self, beta):
method forward (line 144) | def forward(self, heatmap_volumes, grid_coordinates):
method get_loss (line 165) | def get_loss(self, preds, targets, weights):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/model_builder.py
function build_model (line 17) | def build_model(model_name, checkpoint=None):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/model_builder.py
function build_model (line 121) | def build_model(model_name, checkpoint=None):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/ViTPose_trt.py
function torch_device_from_trt (line 6) | def torch_device_from_trt(device):
function torch_dtype_from_trt (line 13) | def torch_dtype_from_trt(dtype):
class TRTModule_ViTPose (line 26) | class TRTModule_ViTPose(torch.nn.Module):
method __init__ (line 27) | def __init__(self, engine=None, input_names=None, output_names=None, i...
method forward (line 68) | def forward(self, *inputs):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/general_utils.py
function make_parser (line 17) | def make_parser():
function jitter (line 32) | def jitter(tracking,temp,id1):
function jitter2 (line 34) | def jitter2(tracking,temp,id1) :
function create_json_rabbitmq (line 37) | def create_json_rabbitmq( FRAME_ID,pose):
function producer_rabbitmq (line 40) | def producer_rabbitmq():
function fix_head (line 42) | def fix_head(xyz):
function flatten_lst (line 45) | def flatten_lst(x):
function polys_from_pose (line 51) | def polys_from_pose(pts):
function fix_list_order (line 86) | def fix_list_order(list_,list2):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/logger_helper.py
class CustomFormatter (line 3) | class CustomFormatter(logging.Formatter):
method format (line 20) | def format(self, record):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/pose_utils.py
function pose_points_yolo5 (line 25) | def pose_points_yolo5(detector,image,pose,tracker,tensorrt):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/pose_viz.py
function joints_dict (line 9) | def joints_dict():
function draw_points (line 71) | def draw_points(image, points, color_palette='tab20', palette_samples=16...
function draw_skeleton (line 110) | def draw_skeleton(image, points, skeleton, color_palette='Set2', palette...
function draw_points_and_skeleton (line 156) | def draw_points_and_skeleton(image, points, skeleton, points_color_palet...
function save_images (line 195) | def save_images(images, target, joint_target, output, joint_output, join...
function check_video_rotation (line 271) | def check_video_rotation(filename):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/timerr.py
class Timer (line 4) | class Timer(object):
method __init__ (line 6) | def __init__(self):
method tic (line 15) | def tic(self):
method toc (line 20) | def toc(self, average=True):
method clear (line 31) | def clear(self):
FILE: eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/visualizer.py
function vis (line 7) | def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
function get_color (line 41) | def get_color(idx):
function plot_tracking (line 48) | def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0....
FILE: eval/GVHMR/hmr4d/utils/pylogger.py
function sync_time (line 7) | def sync_time():
function timer (line 29) | def timer(sync_cuda=False, mem=False, loop=1):
function timed (line 68) | def timed(fn):
FILE: eval/GVHMR/hmr4d/utils/seq_utils.py
function get_frame_id_list_from_mask (line 27) | def get_frame_id_list_from_mask(mask):
function get_batch_frame_id_lists_from_mask_BLC (line 56) | def get_batch_frame_id_lists_from_mask_BLC(masks):
function get_frame_id_list_from_frame_id (line 96) | def get_frame_id_list_from_frame_id(frame_id):
function rearrange_by_mask (line 103) | def rearrange_by_mask(x, mask):
function frame_id_to_mask (line 119) | def frame_id_to_mask(frame_id, max_len):
function mask_to_frame_id (line 125) | def mask_to_frame_id(mask):
function linear_interpolate_frame_ids (line 130) | def linear_interpolate_frame_ids(data, frame_id_list):
function linear_interpolate (line 149) | def linear_interpolate(data, N_middle_frames):
function find_top_k_span (line 163) | def find_top_k_span(mask, k=3):
FILE: eval/GVHMR/hmr4d/utils/smplx_utils.py
function make_smplx (line 18) | def make_smplx(type="neu_fullpose", **kwargs):
function load_parents (line 172) | def load_parents(npz_path="models/smplx/SMPLX_NEUTRAL.npz"):
function load_smpl_faces (line 179) | def load_smpl_faces(npz_path="models/smplh/SMPLH_FEMALE.pkl"):
function decompose_fullpose (line 186) | def decompose_fullpose(fullpose, model_type="smplx"):
function compose_fullpose (line 202) | def compose_fullpose(fullpose_dict, model_type="smplx"):
function compute_R_from_kinetree (line 222) | def compute_R_from_kinetree(rot_mats, parents):
function compute_relR_from_kinetree (line 245) | def compute_relR_from_kinetree(R, parents):
function quat_mul (line 268) | def quat_mul(x, y):
function quat_inv (line 301) | def quat_inv(q):
function quat_mul_vec (line 313) | def quat_mul_vec(q, x):
function inverse_kinematics_motion (line 329) | def inverse_kinematics_motion(
function transform_mat (line 352) | def transform_mat(R, t):
function normalize_joints (line 364) | def normalize_joints(joints):
function compute_Rt_af2az (line 383) | def compute_Rt_af2az(joints, inverse=False):
function finite_difference_forward (line 411) | def finite_difference_forward(x, dim_t=1, dup_last=True):
function compute_joints_zero (line 422) | def compute_joints_zero(betas, gender):
FILE: eval/GVHMR/hmr4d/utils/video_io_utils.py
function get_video_lwh (line 11) | def get_video_lwh(video_path):
function read_video_np (line 16) | def read_video_np(video_path, start_frame=0, end_frame=-1, scale=1.0):
function get_video_reader (line 47) | def get_video_reader(video_path):
function read_images_np (line 51) | def read_images_np(image_paths, verbose=False):
function save_video (line 66) | def save_video(images, video_path, fps=30, crf=17):
function get_writer (line 84) | def get_writer(video_path, fps=30, crf=17):
function copy_file (line 92) | def copy_file(video_path, out_video_path, overwrite=True):
function merge_videos_horizontal (line 98) | def merge_videos_horizontal(in_video_paths: list, out_video_path: str):
function merge_videos_vertical (line 107) | def merge_videos_vertical(in_video_paths: list, out_video_path: str):
FILE: eval/GVHMR/hmr4d/utils/vis/cv2_utils.py
function to_numpy (line 7) | def to_numpy(x):
function draw_bbx_xys_on_image (line 15) | def draw_bbx_xys_on_image(bbx_xys, image, conf=True):
function draw_bbx_xys_on_image_batch (line 26) | def draw_bbx_xys_on_image_batch(bbx_xys_batch, image_batch, conf=None):
function draw_bbx_xyxy_on_image (line 40) | def draw_bbx_xyxy_on_image(bbx_xys, image, conf=True):
function draw_bbx_xyxy_on_image_batch (line 48) | def draw_bbx_xyxy_on_image_batch(bbx_xyxy_batch, image_batch, mask=None,...
function draw_kpts (line 74) | def draw_kpts(frame, keypoints, color=(0, 255, 0), thickness=2):
function draw_kpts_with_conf (line 81) | def draw_kpts_with_conf(frame, kp2d, conf, thickness=2):
function draw_kpts_with_conf_batch (line 98) | def draw_kpts_with_conf_batch(frames, kp2d_batch, conf_batch, thickness=2):
function draw_coco17_skeleton (line 112) | def draw_coco17_skeleton(img, keypoints, conf_thr=0):
function draw_coco17_skeleton_batch (line 138) | def draw_coco17_skeleton_batch(imgs, keypoints_batch, conf_thr=0):
FILE: eval/GVHMR/hmr4d/utils/vis/renderer.py
function overlay_image_onto_background (line 29) | def overlay_image_onto_background(image, mask, bbox, background):
function update_intrinsics_from_bbox (line 45) | def update_intrinsics_from_bbox(K_org, bbox):
function perspective_projection (line 73) | def perspective_projection(x3d, K, R=None, T=None):
function compute_bbox_from_points (line 84) | def compute_bbox_from_points(X, img_w, img_h, scaleFactor=1.2):
class Renderer (line 105) | class Renderer:
method __init__ (line 106) | def __init__(self, width, height, focal_length=None, device="cuda", fa...
method create_renderer (line 123) | def create_renderer(self):
method create_camera (line 136) | def create_camera(self, R=None, T=None):
method initialize_camera_params (line 146) | def initialize_camera_params(self, focal_length, K):
method set_intrinsic (line 167) | def set_intrinsic(self, K):
method set_ground (line 170) | def set_ground(self, length, center_x, center_z):
method update_bbox (line 177) | def update_bbox(self, x3d, scale=2.0, mask=None):
method reset_bbox (line 198) | def reset_bbox(
method render_mesh (line 210) | def render_mesh(self, vertices, background=None, colors=[0.8, 0.8, 0.8...
method render_with_ground (line 244) | def render_with_ground(self, verts, colors, cameras, lights, faces=None):
function create_meshes (line 278) | def create_meshes(verts, faces, colors):
function get_global_cameras (line 289) | def get_global_cameras(verts, device="cuda", distance=5, position=(-5.0,...
function get_global_cameras_static (line 305) | def get_global_cameras_static(
function get_ground_params_from_points (line 343) | def get_ground_params_from_points(root_points, vert_points):
FILE: eval/GVHMR/hmr4d/utils/vis/renderer_tools.py
function read_image (line 8) | def read_image(path, scale=1):
function transform_torch3d (line 17) | def transform_torch3d(T_c2w):
function transform_pyrender (line 44) | def transform_pyrender(T_c2w):
function smpl_to_geometry (line 60) | def smpl_to_geometry(verts, faces, vis_mask=None, track_ids=None):
function filter_visible_meshes (line 79) | def filter_visible_meshes(verts, colors, faces, vis_mask=None, vis_opaci...
function get_bboxes (line 112) | def get_bboxes(verts, vis_mask):
function track_to_colors (line 137) | def track_to_colors(track_ids):
function get_colors (line 145) | def get_colors():
function checkerboard_geometry (line 161) | def checkerboard_geometry(
function camera_marker_geometry (line 214) | def camera_marker_geometry(radius, height, up):
function vis_keypoints (line 261) | def vis_keypoints(
function imshow_keypoints (line 714) | def imshow_keypoints(
FILE: eval/GVHMR/hmr4d/utils/vis/renderer_utils.py
function simple_render_mesh (line 6) | def simple_render_mesh(render_dict):
function simple_render_mesh_background (line 21) | def simple_render_mesh_background(render_dict, VI=50, colors=[0.8, 0.8, ...
FILE: eval/GVHMR/hmr4d/utils/vis/rich_logger.py
function print_cfg (line 10) | def print_cfg(cfg: DictConfig, use_rich: bool = False):
FILE: eval/GVHMR/hmr4d/utils/wis3d_utils.py
function make_wis3d (line 10) | def make_wis3d(output_dir="outputs/wis3d", name="debug", time_postfix=Fa...
function get_gradient_colors (line 38) | def get_gradient_colors(scheme="red", num_points=120, alpha=1.0):
function get_const_colors (line 48) | def get_const_colors(name="red", partial_shape=(120, 5), alpha=1.0):
function get_colors_by_conf (line 58) | def get_colors_by_conf(conf, low="red", high="green"):
function convert_motion_as_line_mesh (line 92) | def convert_motion_as_line_mesh(motion, skeleton_type="smpl22", const_co...
function add_motion_as_lines (line 124) | def add_motion_as_lines(motion, wis3d, name="joints22", skeleton_type="s...
function add_prog_motion_as_lines (line 139) | def add_prog_motion_as_lines(motion, wis3d, name="joints22", skeleton_ty...
function add_joints_motion_as_spheres (line 174) | def add_joints_motion_as_spheres(joints, wis3d, radius=0.05, name="joint...
function create_skeleton_mesh (line 208) | def create_skeleton_mesh(p1, p2, radius, color, resolution=4, return_mer...
function get_lines_of_my_frustum (line 278) | def get_lines_of_my_frustum(frustum_points):
function draw_colored_vec (line 287) | def draw_colored_vec(wis3d, vec, name, radius=0.02, colors="r", starts=N...
function draw_T_w2c (line 315) | def draw_T_w2c(wis3d, T_w2c, name, radius=0.01, all_in_one=True, l=0.1):
function create_checkerboard_mesh (line 335) | def create_checkerboard_mesh(y=0.0, grid_size=1.0, bounds=((-3, -3), (3,...
function add_a_trimesh (line 393) | def add_a_trimesh(mesh, wis3d, name):
FILE: eval/GVHMR/tools/demo/demo.py
function parse_args_to_cfg (line 40) | def parse_args_to_cfg():
function run_preprocess (line 88) | def run_preprocess(cfg):
function load_data_dict (line 157) | def load_data_dict(cfg):
function render_incam (line 180) | def render_incam(cfg):
function render_global (line 222) | def render_global(cfg):
FILE: eval/GVHMR/tools/eval_pose.py
function batch_rotation_matrix_angle_error (line 32) | def batch_rotation_matrix_angle_error(R1_batch, R2_batch):
function normalize (line 51) | def normalize(x):
function viewmatrix (line 54) | def viewmatrix(z, up, pos):
function matrix_to_euler_angles (line 62) | def matrix_to_euler_angles(matrix):
function eul2rot (line 77) | def eul2rot(theta) :
function extract_location_rotation (line 85) | def extract_location_rotation(data):
function parse_matrix (line 97) | def parse_matrix(matrix_str):
function batch_axis_angle_to_rotation_matrix (line 105) | def batch_axis_angle_to_rotation_matrix(r_batch):
FILE: eval/GVHMR/tools/train.py
function get_callbacks (line 12) | def get_callbacks(cfg: DictConfig) -> list:
function train (line 31) | def train(cfg: DictConfig) -> None:
function main (line 80) | def main(cfg) -> None:
FILE: eval/GVHMR/tools/unitest/run_dataset.py
function get_dataset (line 6) | def get_dataset(DATA_TYPE):
FILE: eval/GVHMR/tools/video/merge_folder.py
function main (line 8) | def main():
FILE: eval/GVHMR/tools/video/merge_horizontal.py
function parse_args (line 5) | def parse_args():
FILE: eval/GVHMR/tools/video/merge_vertical.py
function parse_args (line 5) | def parse_args():
FILE: eval/common_metrics_on_video_quality/calculate_clip.py
function get_video_scores (line 21) | def get_video_scores(video_path, prompt):
FILE: eval/common_metrics_on_video_quality/calculate_fvd.py
function trans (line 5) | def trans(x):
function calculate_fvd (line 15) | def calculate_fvd(videos1, videos2, device, method='styleganv'):
function main (line 66) | def main():
FILE: eval/common_metrics_on_video_quality/calculate_lpips.py
function trans (line 15) | def trans(x):
function calculate_lpips (line 25) | def calculate_lpips(videos1, videos2, device):
function main (line 82) | def main():
FILE: eval/common_metrics_on_video_quality/calculate_psnr.py
function img_psnr (line 6) | def img_psnr(img1, img2):
function trans (line 17) | def trans(x):
function calculate_psnr (line 20) | def calculate_psnr(videos1, videos2):
function main (line 72) | def main():
FILE: eval/common_metrics_on_video_quality/calculate_ssim.py
function ssim (line 6) | def ssim(img1, img2):
function calculate_ssim_function (line 26) | def calculate_ssim_function(img1, img2):
function trans (line 44) | def trans(x):
function calculate_ssim (line 47) | def calculate_ssim(videos1, videos2):
function main (line 99) | def main():
Condensed preview — 276 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (7,387K chars).
[
{
"path": "CogVideo/.github/ISSUE_TEMPLATE/bug_report.yaml",
"chars": 2141,
"preview": "name: \"\\U0001F41B Bug Report\"\ndescription: Submit a bug report to help us improve CogVideoX / 提交一个 Bug 问题报告来帮助我们改进 CogVi"
},
{
"path": "CogVideo/.github/ISSUE_TEMPLATE/feature-request.yaml",
"chars": 1014,
"preview": "name: \"\\U0001F680 Feature request\"\ndescription: Submit a request for a new CogVideoX feature / 提交一个新的 CogVideoX开源模型的功能建议"
},
{
"path": "CogVideo/.github/PULL_REQUEST_TEMPLATE/pr_template.md",
"chars": 1425,
"preview": "# Raise valuable PR / 提出有价值的PR\n\n## Caution / 注意事项:\nUsers should keep the following points in mind when submitting PRs:\n"
},
{
"path": "CogVideo/.gitignore",
"chars": 89,
"preview": "*__pycache__/\nsamples*/\nruns/\ncheckpoints/\nmaster_ip\nlogs/\n*.DS_Store\n.idea\noutput*\ntest*"
},
{
"path": "CogVideo/LICENSE",
"chars": 11360,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "CogVideo/MODEL_LICENSE",
"chars": 4069,
"preview": "The CogVideoX License\n\n1. Definitions\n\n“Licensor” means the CogVideoX Model Team that distributes its Software.\n\n“Softwa"
},
{
"path": "CogVideo/README.md",
"chars": 24449,
"preview": "# CogVideo & CogVideoX\n\n[中文阅读](./README_zh.md)\n\n[日本語で読む](./README_ja.md)\n\n<div align=\"center\">\n<img src=resources/logo.s"
},
{
"path": "CogVideo/README_ja.md",
"chars": 18477,
"preview": "# CogVideo & CogVideoX\n\n[Read this in English](./README_zh.md)\n\n[中文阅读](./README_zh.md)\n\n<div align=\"center\">\n<img src=re"
},
{
"path": "CogVideo/README_zh.md",
"chars": 16633,
"preview": "# CogVideo & CogVideoX\n\n[Read this in English](./README_zh.md)\n\n[日本語で読む](./README_ja.md)\n\n\n<div align=\"center\">\n<img src"
},
{
"path": "CogVideo/download.sh",
"chars": 272,
"preview": "mkdir CogVideoX-2b-sat\ncd CogVideoX-2b-sat\nwget https://cloud.tsinghua.edu.cn/f/fdba7608a49c463ba754/?dl=1\nmv 'index.htm"
},
{
"path": "CogVideo/finetune/README.md",
"chars": 5997,
"preview": "# CogVideoX diffusers Fine-tuning Guide\n\n[中文阅读](./README_zh.md)\n\n[日本語で読む](./README_ja.md)\n\nThis feature is not fully com"
},
{
"path": "CogVideo/finetune/README_ja.md",
"chars": 3982,
"preview": "# CogVideoX diffusers 微調整方法\n\n[Read this in English.](./README_zh)\n\n[中文阅读](./README_zh.md)\n\n\nこの機能はまだ完全に完成していません。SATバージョンの"
},
{
"path": "CogVideo/finetune/README_zh.md",
"chars": 3598,
"preview": "# CogVideoX diffusers 微调方案\n\n[Read this in English](./README_zh.md)\n\n[日本語で読む](./README_ja.md)\n\n本功能尚未完全完善,如果您想查看SAT版本微调,请查"
},
{
"path": "CogVideo/finetune/accelerate_config_machine_single.yaml",
"chars": 532,
"preview": "compute_environment: LOCAL_MACHINE\ndebug: false\ndeepspeed_config:\n gradient_accumulation_steps: 1\n gradient_clipping: "
},
{
"path": "CogVideo/finetune/accelerate_config_machine_single_debug.yaml",
"chars": 532,
"preview": "compute_environment: LOCAL_MACHINE\ndebug: false\ndeepspeed_config:\n gradient_accumulation_steps: 1\n gradient_clipping: "
},
{
"path": "CogVideo/finetune/finetune_single_rank_injector.sh",
"chars": 2207,
"preview": "#!/bin/bash\n\nexport MODEL_PATH=\"/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b\" # Change it to CogVideoX-5B"
},
{
"path": "CogVideo/finetune/finetune_single_rank_lora.sh",
"chars": 1850,
"preview": "#!/bin/bash\n\nexport MODEL_PATH=\"/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b\" # Change it to CogVideoX-5B"
},
{
"path": "CogVideo/finetune/hostfile.txt",
"chars": 27,
"preview": "node1 slots=8\nnode2 slots=8"
},
{
"path": "CogVideo/finetune/models/attention.py",
"chars": 53258,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "CogVideo/finetune/models/attention_processor.py",
"chars": 185661,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "CogVideo/finetune/models/cogvideox_transformer_3d.py",
"chars": 25367,
"preview": "# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.\n# All rights reserved.\n#\n# "
},
{
"path": "CogVideo/finetune/models/embeddings.py",
"chars": 73471,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "CogVideo/finetune/models/pipeline_cogvideox.py",
"chars": 39244,
"preview": "# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.\n# All rights reserved.\n#\n# "
},
{
"path": "CogVideo/finetune/models/pipeline_output.py",
"chars": 616,
"preview": "from dataclasses import dataclass\n\nimport torch\n\nfrom diffusers.utils import BaseOutput\n\n\n@dataclass\nclass CogVideoXPipe"
},
{
"path": "CogVideo/finetune/models/utils.py",
"chars": 5098,
"preview": "import os\nfrom typing import Callable, Dict, List, Optional, Union\n\nimport torch\nfrom huggingface_hub.utils import valid"
},
{
"path": "CogVideo/finetune/train_cogvideox_injector.py",
"chars": 64223,
"preview": "\"\"\"\nAdapted from CogVideoX-5B: https://github.com/THUDM/CogVideo by Xiao Fu (CUHK)\n\"\"\"\n\nimport argparse\nimport logging\ni"
},
{
"path": "CogVideo/finetune/train_cogvideox_lora.py",
"chars": 63147,
"preview": "\"\"\"\nAdapted from CogVideoX-5B: https://github.com/THUDM/CogVideo by Xiao Fu (CUHK)\n\"\"\"\n\nimport argparse\nimport logging\ni"
},
{
"path": "CogVideo/inference/3dtrajmaster_inference.py",
"chars": 11195,
"preview": "\"\"\"\nAdapted from CogVideoX-5B: https://github.com/THUDM/CogVideo by Xiao Fu (CUHK)\n\"\"\"\n\nimport argparse\nfrom typing impo"
},
{
"path": "CogVideo/inference/entity_zoo.txt",
"chars": 9422,
"preview": "[\n \"a fire spirit with long, twisting flames resembling flowing red and orange hair, a bright yellow core\",\n \"a ge"
},
{
"path": "CogVideo/inference/location_zoo.txt",
"chars": 599,
"preview": "[\n 'fjord',\n 'sunset beach',\n 'cave',\n 'snowy tundra',\n 'prairie',\n 'asian town',\n 'rainforest',\n "
},
{
"path": "CogVideo/pyproject.toml",
"chars": 699,
"preview": "[tool.ruff]\nline-length = 119\n\n[tool.ruff.lint]\n# Never enforce `E501` (line length violations).\nignore = [\"C901\", \"E501"
},
{
"path": "CogVideo/requirements.txt",
"chars": 300,
"preview": "diffusers==0.31.0\naccelerate==1.1.1\ntransformers==4.46.2\nnumpy==1.26.0\n# torch==2.5.0\n# torchvision==0.20.0\nsentencepiec"
},
{
"path": "CogVideo/tools/caption/README.md",
"chars": 2788,
"preview": "# Video Caption\n\nTypically, most video data does not come with corresponding descriptive text, so it is necessary to con"
},
{
"path": "CogVideo/tools/caption/README_ja.md",
"chars": 2373,
"preview": "# ビデオキャプション\n\n通常、ほとんどのビデオデータには対応する説明文が付いていないため、ビデオデータをテキストの説明に変換して、テキストからビデオへのモデルに必要なトレーニングデータを提供する必要があります。\n\n## 更新とニュース\n-"
},
{
"path": "CogVideo/tools/caption/README_zh.md",
"chars": 2277,
"preview": "# 视频Caption\n\n通常,大多数视频数据不带有相应的描述性文本,因此需要将视频数据转换为文本描述,以提供必要的训练数据用于文本到视频模型。\n\n## 项目更新\n- 🔥🔥 **News**: ```2024/9/19```: CogVid"
},
{
"path": "CogVideo/tools/caption/requirements.txt",
"chars": 398,
"preview": "decord>=0.6.0\n#根据https://download.pytorch.org/whl/torch/,python版本为[3.8,3.11]\ntorch==2.1.0\ntorchvision== 0.16.0\npytorchvi"
},
{
"path": "CogVideo/tools/caption/video_caption.py",
"chars": 3450,
"preview": "import io\n\nimport argparse\nimport numpy as np\nimport torch\nfrom decord import cpu, VideoReader, bridge\nfrom transformers"
},
{
"path": "CogVideo/tools/convert_weight_sat2hf.py",
"chars": 12106,
"preview": "\"\"\"\nThis script demonstrates how to convert and generate video from a text prompt\nusing CogVideoX with 🤗Huggingface Diff"
},
{
"path": "CogVideo/tools/export_sat_lora_weight.py",
"chars": 2704,
"preview": "from typing import Any, Dict\nimport torch \nimport argparse \nfrom diffusers.loaders.lora_base import LoraBaseMixin\nfrom d"
},
{
"path": "CogVideo/tools/llm_flux_cogvideox/generate.sh",
"chars": 1093,
"preview": "#!/bin/bash\n\nNUM_VIDEOS=10\nINFERENCE_STEPS=50\nGUIDANCE_SCALE=7.0\nOUTPUT_DIR_PREFIX=\"outputs/gpu_\"\nLOG_DIR_PREFIX=\"logs/g"
},
{
"path": "CogVideo/tools/llm_flux_cogvideox/gradio_page.py",
"chars": 7406,
"preview": "import os\nimport gradio as gr\nimport gc\nimport random\nimport torch\nimport numpy as np\nfrom PIL import Image\nimport trans"
},
{
"path": "CogVideo/tools/llm_flux_cogvideox/llm_flux_cogvideox.py",
"chars": 9461,
"preview": "\"\"\"\nThe original experimental code for this project can be found at:\n\nhttps://gist.github.com/a-r-r-o-w/d070cce059ab4cea"
},
{
"path": "CogVideo/tools/load_cogvideox_lora.py",
"chars": 4752,
"preview": "# Copyright 2024 The HuggingFace Team.\n# All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "CogVideo/tools/parallel_inference/parallel_inference_xdit.py",
"chars": 3685,
"preview": "\"\"\"\nThis is a parallel inference script for CogVideo. The original script\ncan be found from the xDiT project at\n\nhttps:/"
},
{
"path": "CogVideo/tools/parallel_inference/run.sh",
"chars": 1338,
"preview": "set -x\n\nexport PYTHONPATH=$PWD:$PYTHONPATH\n\n# Select the model type\n# The model is downloaded to a specified location on"
},
{
"path": "CogVideo/tools/replicate/cog.yaml",
"chars": 1022,
"preview": "# Configuration for Cog ⚙️\n# Reference: https://cog.run/yaml\n\nbuild:\n # set to true if your model requires a GPU\n gpu:"
},
{
"path": "CogVideo/tools/replicate/predict_i2v.py",
"chars": 2865,
"preview": "# Prediction interface for Cog ⚙️\n# https://cog.run/python\n\nimport os\nimport subprocess\nimport time\nimport torch\nfrom di"
},
{
"path": "CogVideo/tools/replicate/predict_t2v.py",
"chars": 3285,
"preview": "# Prediction interface for Cog ⚙️\n# https://cog.run/python\n\nimport os\nimport subprocess\nimport time\nimport torch\nfrom di"
},
{
"path": "CogVideo/tools/venhancer/README.md",
"chars": 8127,
"preview": "# Enhance CogVideoX Generated Videos with VEnhancer\n\nThis tutorial will guide you through using the VEnhancer tool to en"
},
{
"path": "CogVideo/tools/venhancer/README_ja.md",
"chars": 7227,
"preview": "\n# VEnhancer で CogVideoX によって生成されたビデオを強化する\n\nこのチュートリアルでは、VEnhancer ツールを使用して、CogVideoX で生成されたビデオを強化し、より高いフレームレートと高い解像度を実現す"
},
{
"path": "CogVideo/tools/venhancer/README_zh.md",
"chars": 6892,
"preview": "# 使用 VEnhancer 对 CogVdieoX 生成视频进行增强\n\n本教程将要使用 VEnhancer 工具 对 CogVdieoX 生成视频进行增强, 包括更高的帧率和更高的分辨率\n\n## 模型介绍\n\nVEnhancer 在一个统一"
},
{
"path": "CogVideo/weights/put weights here.txt",
"chars": 0,
"preview": ""
},
{
"path": "README.md",
"chars": 14605,
"preview": "## ___***3DTrajMaster: Mastering 3D Trajectory for Multi-Entity Motion in Video Generation***___\n<div align=\"center\">\n<i"
},
{
"path": "dataset/load_dataset.py",
"chars": 6322,
"preview": "# Copyright 2024 Xiao Fu, CUHK, Kuaishou Tech. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 ("
},
{
"path": "dataset/traj_vis/D_loc1_61_t3n13_003d_Hemi12_1.json",
"chars": 35277,
"preview": "{\"0\":[{\"index\":113,\"matrix\":\"[-0.00531441 0.999986 0 0] [-0.999986 -0.00531441 0 0] [0 -0 1 0] [18999.8 -22935.3 147.103"
},
{
"path": "dataset/traj_vis/Hemi12_transforms.json",
"chars": 2853,
"preview": "{\r\n \"C_01_35mm\": \"[-0.8622445326446021 -0.497817113029644 -0.09334070869305826 0] [0.49999999999999994 -0.86602540378"
},
{
"path": "dataset/traj_vis/location_data_desert.json",
"chars": 6900,
"preview": "[\r\n {\r\n \"name\": \"loc1\",\r\n \"coordinates\": {\r\n \"CameraRig_Rail\": {\r\n \"position\""
},
{
"path": "dataset/utils.py",
"chars": 9504,
"preview": "# Copyright 2024 Xiao Fu, CUHK, Kuaishou Tech. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 ("
},
{
"path": "dataset/vis_trajectory.py",
"chars": 4865,
"preview": "import trimesh\nimport numpy as np\nimport imageio\nimport copy\nimport cv2\nimport os\nfrom glob import glob\nimport open3d\nfr"
},
{
"path": "eval/GVHMR/.gitignore",
"chars": 3248,
"preview": ".vscode\n.hydra\ninputs\noutputs\n\n# All file or folders start with tmp will be ignored\ntmp* \n\n# Byte-compiled / optimized "
},
{
"path": "eval/GVHMR/.gitmodules",
"chars": 104,
"preview": "[submodule \"third-party/DPVO\"]\n\tpath = third-party/DPVO\n\turl = https://github.com/princeton-vl/DPVO.git\n"
},
{
"path": "eval/GVHMR/LICENSE",
"chars": 768,
"preview": "Copyright 2022-2023 3D Vision Group at the State Key Lab of CAD&CG, \nZhejiang University. All Rights Reserved. \n\nFor mo"
},
{
"path": "eval/GVHMR/README.md",
"chars": 3229,
"preview": "# GVHMR: World-Grounded Human Motion Recovery via Gravity-View Coordinates\n### [Project Page](https://zju3dv.github.io/g"
},
{
"path": "eval/GVHMR/docs/INSTALL.md",
"chars": 2716,
"preview": "# Install\n\n## Environment\n\n```bash\ngit clone https://github.com/zju3dv/GVHMR --recursive\ncd GVHMR\n\nconda create -y -n gv"
},
{
"path": "eval/GVHMR/download_eval_pose.sh",
"chars": 187,
"preview": "gdown https://drive.google.com/uc\\?id\\=1jMH2-ZC0ZBgtqej5Sp-E5ebBIX7mk3Xz\ngdown https://drive.google.com/uc\\?id\\=1iFcPSlc"
},
{
"path": "eval/GVHMR/eval.sh",
"chars": 132,
"preview": "python tools/demo/demo_folder.py -f eval_sets -d outputs/eval_sets_gvhmr -s\npython tools/eval_pose.py -f outputs/eval_se"
},
{
"path": "eval/GVHMR/hmr4d/__init__.py",
"chars": 204,
"preview": "import os\nfrom pathlib import Path\n\nPROJ_ROOT = Path(__file__).resolve().parents[1]\n\n\ndef os_chdir_to_proj_root():\n \""
},
{
"path": "eval/GVHMR/hmr4d/build_gvhmr.py",
"chars": 472,
"preview": "from omegaconf import OmegaConf\nfrom hmr4d import PROJ_ROOT\nfrom hydra.utils import instantiate\nfrom hmr4d.model.gvhmr.g"
},
{
"path": "eval/GVHMR/hmr4d/configs/__init__.py",
"chars": 1036,
"preview": "from dataclasses import dataclass\nfrom hydra.core.config_store import ConfigStore\nfrom hydra_zen import builds\n\nimport a"
},
{
"path": "eval/GVHMR/hmr4d/configs/data/mocap/testY.yaml",
"chars": 216,
"preview": "# definition of lightning datamodule (dataset + dataloader)\n_target_: hmr4d.datamodule.mocap_trainX_testY.DataModule\n\nda"
},
{
"path": "eval/GVHMR/hmr4d/configs/data/mocap/trainX_testY.yaml",
"chars": 314,
"preview": "# definition of lightning datamodule (dataset + dataloader)\n_target_: hmr4d.datamodule.mocap_trainX_testY.DataModule\n\nda"
},
{
"path": "eval/GVHMR/hmr4d/configs/demo.yaml",
"chars": 1271,
"preview": "defaults:\n - _self_\n - model: gvhmr/gvhmr_pl_demo\n - network: gvhmr/relative_transformer\n - endecoder: gvhmr/v1_amas"
},
{
"path": "eval/GVHMR/hmr4d/configs/exp/gvhmr/mixed/mixed.yaml",
"chars": 1856,
"preview": "# @package _global_\ndefaults:\n - override /data: mocap/trainX_testY\n - override /model: gvhmr/gvhmr_pl\n - overr"
},
{
"path": "eval/GVHMR/hmr4d/configs/global/debug/debug_train.yaml",
"chars": 393,
"preview": "# @package _global_\n\ndata_name: debug\nexp_name: debug\n\n# data:\n# limit_each_trainset: 40\n# loader_opts:\n# train:"
},
{
"path": "eval/GVHMR/hmr4d/configs/global/debug/debug_train_limit_data.yaml",
"chars": 349,
"preview": "# @package _global_\n\ndata_name: debug\nexp_name: debug\n\ndata:\n limit_each_trainset: 40\n loader_opts:\n train:\n b"
},
{
"path": "eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_3dpw.yaml",
"chars": 331,
"preview": "# @package _global_\ndefaults:\n - override /data: mocap/testY\n - override /test_datasets:\n - 3dpw/fliptest\n - ove"
},
{
"path": "eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_3dpw_emdb_rich.yaml",
"chars": 462,
"preview": "# @package _global_\ndefaults:\n - override /data: mocap/testY\n - override /test_datasets:\n - rich/all\n - emdb"
},
{
"path": "eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_emdb.yaml",
"chars": 383,
"preview": "# @package _global_\ndefaults:\n - override /data: mocap/testY\n - override /test_datasets:\n - emdb1/v1_fliptest\n "
},
{
"path": "eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_rich.yaml",
"chars": 326,
"preview": "# @package _global_\ndefaults:\n - override /data: mocap/testY\n - override /test_datasets:\n - rich/all\n - override"
},
{
"path": "eval/GVHMR/hmr4d/configs/hydra/default.yaml",
"chars": 476,
"preview": "# enable color logging\ndefaults:\n - override hydra_logging: colorlog\n - override job_logging: colorlog\n\njob_logging:\n "
},
{
"path": "eval/GVHMR/hmr4d/configs/siga24_release.yaml",
"chars": 840,
"preview": "pipeline:\n _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline\n args_denoiser3d: ${network}\n args:\n endec"
},
{
"path": "eval/GVHMR/hmr4d/configs/store_gvhmr.py",
"chars": 900,
"preview": "# Dataset\nimport hmr4d.dataset.pure_motion.amass\nimport hmr4d.dataset.emdb.emdb_motion_test\nimport hmr4d.dataset.rich.ri"
},
{
"path": "eval/GVHMR/hmr4d/configs/train.yaml",
"chars": 1210,
"preview": "# ================================ #\n# override #\n# ================================ #\n# specify"
},
{
"path": "eval/GVHMR/hmr4d/datamodule/mocap_trainX_testY.py",
"chars": 5502,
"preview": "import pytorch_lightning as pl\nfrom pytorch_lightning.utilities.combined_loader import CombinedLoader\nfrom hydra.utils i"
},
{
"path": "eval/GVHMR/hmr4d/dataset/bedlam/bedlam.py",
"chars": 11608,
"preview": "from pathlib import Path\nimport numpy as np\nimport torch\nfrom hmr4d.utils.pylogger import Log\nfrom pytorch3d.transforms "
},
{
"path": "eval/GVHMR/hmr4d/dataset/bedlam/utils.py",
"chars": 1546,
"preview": "import torch\nimport numpy as np\nfrom pathlib import Path\n\nresource_dir = Path(__file__).parent / \"resource\"\n\n\ndef mid2vn"
},
{
"path": "eval/GVHMR/hmr4d/dataset/emdb/emdb_motion_test.py",
"chars": 5954,
"preview": "from pathlib import Path\nimport numpy as np\nimport torch\nfrom torch.utils import data\nfrom hmr4d.utils.pylogger import L"
},
{
"path": "eval/GVHMR/hmr4d/dataset/emdb/utils.py",
"chars": 5538,
"preview": "import torch\nimport pickle\nimport numpy as np\nfrom pathlib import Path\nfrom tqdm import tqdm\nfrom hmr4d.utils.geo_transf"
},
{
"path": "eval/GVHMR/hmr4d/dataset/h36m/camera-parameters.json",
"chars": 32041,
"preview": "{\n \"intrinsics\": {\n \"54138969\": {\n \"calibration_matrix\": [\n [\n 1145.04940458804,\n "
},
{
"path": "eval/GVHMR/hmr4d/dataset/h36m/h36m.py",
"chars": 8424,
"preview": "import torch\nimport numpy as np\nfrom pathlib import Path\nfrom hmr4d.configs import MainStore, builds\n\nfrom hmr4d.utils.p"
},
{
"path": "eval/GVHMR/hmr4d/dataset/h36m/utils.py",
"chars": 2581,
"preview": "import json\nimport numpy as np\nfrom pathlib import Path\nfrom collections import defaultdict\nimport pickle\nimport torch\n\n"
},
{
"path": "eval/GVHMR/hmr4d/dataset/imgfeat_motion/base_dataset.py",
"chars": 745,
"preview": "import torch\nfrom torch.utils import data\nimport numpy as np\nfrom pathlib import Path\nfrom hmr4d.utils.pylogger import L"
},
{
"path": "eval/GVHMR/hmr4d/dataset/pure_motion/amass.py",
"chars": 4940,
"preview": "import torch\nimport torch.nn.functional as F\nimport numpy as np\n\nfrom tqdm import tqdm\nfrom pathlib import Path\nfrom hmr"
},
{
"path": "eval/GVHMR/hmr4d/dataset/pure_motion/base_dataset.py",
"chars": 7687,
"preview": "import torch\nfrom torch.utils.data import Dataset\nfrom pathlib import Path\n\nfrom .utils import *\nfrom .cam_traj_utils im"
},
{
"path": "eval/GVHMR/hmr4d/dataset/pure_motion/cam_traj_utils.py",
"chars": 17302,
"preview": "import torch\nimport torch.nn.functional as F\nimport numpy as np\nfrom numpy.random import rand, randn\nfrom pytorch3d.tran"
},
{
"path": "eval/GVHMR/hmr4d/dataset/pure_motion/utils.py",
"chars": 2365,
"preview": "import torch\nimport torch.nn.functional as F\nfrom pytorch3d.transforms import (\n axis_angle_to_matrix,\n matrix_to_"
},
{
"path": "eval/GVHMR/hmr4d/dataset/rich/resource/seqname2imgrange.json",
"chars": 5359,
"preview": "{\"ParkingLot1_002_burpee3\": [1, 351], \"ParkingLot1_002_overfence1\": [1, 268], \"ParkingLot1_002_overfence2\": [1, 270], \"P"
},
{
"path": "eval/GVHMR/hmr4d/dataset/rich/resource/test.txt",
"chars": 3871,
"preview": "sequence_name\tcapture_name\tscan_name\tid\tmoving_cam\tgender\tscene\taction/scene-interaction\tsubjects\tview_id\nParkingLot2_01"
},
{
"path": "eval/GVHMR/hmr4d/dataset/rich/resource/train.txt",
"chars": 4985,
"preview": "sequence_name\tcapture_name\tscan_name\tid\tmoving_cam\tgender\tview_id\nParkingLot1_002_burpee3\tParkingLot1\tscan_camcoord\t002\t"
},
{
"path": "eval/GVHMR/hmr4d/dataset/rich/resource/val.txt",
"chars": 2459,
"preview": "sequence_name\tcapture_name\tscan_name\tid\tmoving_cam\tgender\tscene\taction/scene-interaction\tsubjects\tview_id\nParkingLot1_00"
},
{
"path": "eval/GVHMR/hmr4d/dataset/rich/resource/w2az_sahmr.json",
"chars": 2519,
"preview": "{\"BBQ_scan_camcoord\": [[0.9989829107564298, 0.03367618890797693, -0.029984301180211045, 0.0008183751635392625], [0.03414"
},
{
"path": "eval/GVHMR/hmr4d/dataset/rich/rich_motion_test.py",
"chars": 6979,
"preview": "from pathlib import Path\nimport numpy as np\nimport torch\nfrom torch.utils import data\nfrom hmr4d.utils.pylogger import L"
},
{
"path": "eval/GVHMR/hmr4d/dataset/rich/rich_utils.py",
"chars": 12785,
"preview": "import torch\nimport cv2\nimport numpy as np\nfrom hmr4d.utils.geo_transform import apply_T_on_points, project_p2d\nfrom pat"
},
{
"path": "eval/GVHMR/hmr4d/dataset/threedpw/threedpw_motion_test.py",
"chars": 5728,
"preview": "import torch\nfrom torch.utils import data\nfrom pathlib import Path\n\nfrom hmr4d.utils.pylogger import Log\nfrom hmr4d.util"
},
{
"path": "eval/GVHMR/hmr4d/dataset/threedpw/threedpw_motion_train.py",
"chars": 7152,
"preview": "import torch\nfrom torch.utils import data\nfrom pathlib import Path\nimport numpy as np\n\nfrom hmr4d.utils.pylogger import "
},
{
"path": "eval/GVHMR/hmr4d/dataset/threedpw/utils.py",
"chars": 2803,
"preview": "import json\nimport numpy as np\nfrom pathlib import Path\nfrom collections import defaultdict\nimport pickle\nimport torch\ni"
},
{
"path": "eval/GVHMR/hmr4d/model/common_utils/optimizer.py",
"chars": 688,
"preview": "from torch.optim import AdamW, Adam\nfrom hmr4d.configs import MainStore, builds\n\n\noptimizer_cfgs = {\n \"adam_1e-3\": bu"
},
{
"path": "eval/GVHMR/hmr4d/model/common_utils/scheduler.py",
"chars": 1323,
"preview": "import torch\nfrom bisect import bisect_right\n\n\nclass WarmupMultiStepLR(torch.optim.lr_scheduler.LRScheduler):\n def __"
},
{
"path": "eval/GVHMR/hmr4d/model/common_utils/scheduler_cfg.py",
"chars": 1646,
"preview": "from omegaconf import DictConfig, ListConfig\nfrom hmr4d.configs import MainStore, builds\n\n# do not perform scheduling\nde"
},
{
"path": "eval/GVHMR/hmr4d/model/gvhmr/callbacks/metric_3dpw.py",
"chars": 8465,
"preview": "import torch\nimport pytorch_lightning as pl\nimport numpy as np\nfrom pathlib import Path\nfrom einops import einsum, rearr"
},
{
"path": "eval/GVHMR/hmr4d/model/gvhmr/callbacks/metric_emdb.py",
"chars": 15100,
"preview": "import torch\nimport torch.nn.functional as F\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank"
},
{
"path": "eval/GVHMR/hmr4d/model/gvhmr/callbacks/metric_rich.py",
"chars": 18641,
"preview": "import torch\nimport torch.nn.functional as F\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank"
},
{
"path": "eval/GVHMR/hmr4d/model/gvhmr/gvhmr_pl.py",
"chars": 14415,
"preview": "from typing import Any, Dict\nimport numpy as np\nfrom pathlib import Path\nimport torch\nimport pytorch_lightning as pl\nfro"
},
{
"path": "eval/GVHMR/hmr4d/model/gvhmr/gvhmr_pl_demo.py",
"chars": 2211,
"preview": "import torch\nimport pytorch_lightning as pl\nfrom hydra.utils import instantiate\nfrom hmr4d.utils.pylogger import Log\nfro"
},
{
"path": "eval/GVHMR/hmr4d/model/gvhmr/pipeline/gvhmr_pipeline.py",
"chars": 16191,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.cuda.amp import autocast\nimport numpy as n"
},
{
"path": "eval/GVHMR/hmr4d/model/gvhmr/utils/endecoder.py",
"chars": 7951,
"preview": "import torch\nimport torch.nn as nn\nfrom pytorch3d.transforms import (\n rotation_6d_to_matrix,\n matrix_to_axis_angl"
},
{
"path": "eval/GVHMR/hmr4d/model/gvhmr/utils/postprocess.py",
"chars": 7931,
"preview": "import torch\nfrom torch.cuda.amp import autocast\nfrom pytorch3d.transforms import (\n matrix_to_rotation_6d,\n rotat"
},
{
"path": "eval/GVHMR/hmr4d/model/gvhmr/utils/stats_compose.py",
"chars": 42887,
"preview": "# fmt:off\nbody_pose_r6d = {\n \"bedlam\": {\n \"count\": 5417929,\n \"mean\": [ 0.9772, -0.0925, 0.0028, 0.105"
},
{
"path": "eval/GVHMR/hmr4d/network/base_arch/embeddings/rotary_embedding.py",
"chars": 2197,
"preview": "import torch\nimport torch.nn as nn\nfrom einops import repeat, rearrange\nfrom torch.cuda.amp import autocast\n\n\ndef rotate"
},
{
"path": "eval/GVHMR/hmr4d/network/base_arch/transformer/encoder_rope.py",
"chars": 3595,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport math\nfrom timm.models.vision_transformer impor"
},
{
"path": "eval/GVHMR/hmr4d/network/base_arch/transformer/layer.py",
"chars": 244,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\ndef zero_module(module):\n \"\"\"\n Zero out the p"
},
{
"path": "eval/GVHMR/hmr4d/network/gvhmr/relative_transformer.py",
"chars": 6865,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom einops import einsum, rearrange, repeat\nfrom hmr"
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/__init__.py",
"chars": 1276,
"preview": "import torch\nfrom .hmr2 import HMR2\nfrom pathlib import Path\nfrom .configs import get_config\nfrom hmr4d import PROJ_ROOT"
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/components/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/components/pose_transformer.py",
"chars": 11299,
"preview": "from inspect import isfunction\nfrom typing import Callable, Optional\n\nimport torch\nfrom einops import rearrange\nfrom ein"
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/components/t_cond_mlp.py",
"chars": 6791,
"preview": "import copy\nfrom typing import List, Optional\n\nimport torch\n\n\nclass AdaptiveLayerNorm1D(torch.nn.Module):\n def __init"
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/configs/__init__.py",
"chars": 3301,
"preview": "import os\nfrom typing import Dict\nfrom yacs.config import CfgNode as CN\nfrom pathlib import Path\n\n# CACHE_DIR = os.path."
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/configs/model_config.yaml",
"chars": 2592,
"preview": "task_name: train\ntags:\n- dev\ntrain: true\ntest: false\nckpt_path: null\nseed: null\nDATASETS:\n TRAIN:\n H36M-TRAIN:\n "
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/hmr2.py",
"chars": 1944,
"preview": "import torch\nimport pytorch_lightning as pl\nfrom yacs.config import CfgNode\nfrom .vit import ViT\nfrom .smpl_head import "
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/smpl_head.py",
"chars": 4780,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport numpy as np\nimport einops\n\nfrom .utils.geometr"
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/utils/geometry.py",
"chars": 4248,
"preview": "from typing import Optional\nimport torch\nfrom torch.nn import functional as F\n\n\ndef aa_to_rotmat(theta: torch.Tensor):\n "
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/utils/preproc.py",
"chars": 1485,
"preview": "import cv2\nimport numpy as np\nimport torch\nfrom pathlib import Path\n\nIMAGE_MEAN = torch.tensor([0.485, 0.456, 0.406])\nIM"
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/utils/smpl_wrapper.py",
"chars": 1940,
"preview": "import torch\nimport numpy as np\nimport pickle\nfrom typing import Optional\nimport smplx\nfrom smplx.lbs import vertices2jo"
},
{
"path": "eval/GVHMR/hmr4d/network/hmr2/vit.py",
"chars": 12450,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport math\n\nimport torch\nfrom functools import partial\nimport torch.nn "
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/README.md",
"chars": 69,
"preview": "# README\n\nContents of this folder are modified from HuMoR repository."
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/__init__.py",
"chars": 124,
"preview": "from .body_model import BodyModel\nfrom .body_model_smplh import BodyModelSMPLH\nfrom .body_model_smplx import BodyModelSM"
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/body_model.py",
"chars": 4663,
"preview": "from turtle import forward\nimport numpy as np\n\nimport torch\nimport torch.nn as nn\n\nfrom smplx import SMPL, SMPLH, SMPLX\n"
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/body_model_smplh.py",
"chars": 3520,
"preview": "import torch\nimport torch.nn as nn\nimport smplx\n\nkwargs_disable_member_var = {\n \"create_body_pose\": False,\n \"creat"
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/body_model_smplx.py",
"chars": 4750,
"preview": "import torch\nimport torch.nn as nn\nimport smplx\n\nkwargs_disable_member_var = {\n \"create_body_pose\": False,\n \"creat"
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/min_lbs.py",
"chars": 4491,
"preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom pytorch3d.transforms import a"
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/smpl_lite.py",
"chars": 5571,
"preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom pathlib import Path\nfrom pyto"
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/smpl_vert_segmentation.json",
"chars": 110266,
"preview": "{\n \"rightHand\": [\n 5442, \n 5443, \n 5444, \n 5445, \n 5446, \n 5447, \n 5"
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/smplx_lite.py",
"chars": 13414,
"preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom pathlib import Path\nfrom pyto"
},
{
"path": "eval/GVHMR/hmr4d/utils/body_model/utils.py",
"chars": 9337,
"preview": "import os\nimport numpy as np\nimport torch\n\nSMPLH_JOINT_NAMES = [\n 'pelvis',\n 'left_hip',\n 'right_hip',\n 'spi"
},
{
"path": "eval/GVHMR/hmr4d/utils/callbacks/lr_monitor.py",
"chars": 197,
"preview": "from pytorch_lightning.callbacks import LearningRateMonitor\nfrom hmr4d.configs import builds, MainStore\n\n\nMainStore.stor"
},
{
"path": "eval/GVHMR/hmr4d/utils/callbacks/prog_bar.py",
"chars": 17404,
"preview": "from collections import OrderedDict\nfrom numbers import Number\nfrom datetime import datetime, timedelta\nfrom typing impo"
},
{
"path": "eval/GVHMR/hmr4d/utils/callbacks/simple_ckpt_saver.py",
"chars": 3883,
"preview": "from pathlib import Path\nimport torch\nimport pytorch_lightning as pl\nfrom pytorch_lightning.callbacks.checkpoint import "
},
{
"path": "eval/GVHMR/hmr4d/utils/callbacks/train_speed_timer.py",
"chars": 2666,
"preview": "import pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_only\nfrom time import time\nfrom collect"
},
{
"path": "eval/GVHMR/hmr4d/utils/comm/gather.py",
"chars": 7704,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\"\"\"\n[Copied from detectron2]\nThis file contains p"
},
{
"path": "eval/GVHMR/hmr4d/utils/eval/eval_utils.py",
"chars": 14451,
"preview": "import torch\nimport numpy as np\n\n\n@torch.no_grad()\ndef compute_camcoord_metrics(batch, pelvis_idxs=[1, 2], fps=30, mask="
},
{
"path": "eval/GVHMR/hmr4d/utils/geo/augment_noisy_pose.py",
"chars": 7429,
"preview": "import torch\nfrom pytorch3d.transforms import axis_angle_to_matrix, matrix_to_axis_angle, matrix_to_rotation_6d\nimport h"
},
{
"path": "eval/GVHMR/hmr4d/utils/geo/flip_utils.py",
"chars": 2798,
"preview": "import torch\nfrom pytorch3d.transforms import axis_angle_to_matrix, matrix_to_axis_angle\n\n\ndef flip_heatmap_coco17(outpu"
},
{
"path": "eval/GVHMR/hmr4d/utils/geo/hmr_cam.py",
"chars": 13165,
"preview": "import torch\nimport numpy as np\nfrom hmr4d.utils.geo_transform import project_p2d, convert_bbx_xys_to_lurb, cvt_to_bi01_"
},
{
"path": "eval/GVHMR/hmr4d/utils/geo/hmr_global.py",
"chars": 13647,
"preview": "import torch\nfrom pytorch3d.transforms import axis_angle_to_matrix, matrix_to_axis_angle, matrix_to_quaternion, quaterni"
},
{
"path": "eval/GVHMR/hmr4d/utils/geo/quaternion.py",
"chars": 13209,
"preview": "# Copyright (c) 2018-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "eval/GVHMR/hmr4d/utils/geo/transforms.py",
"chars": 713,
"preview": "import torch\n\n\ndef axis_rotate_to_matrix(angle, axis=\"x\"):\n \"\"\"Get rotation matrix for rotating around one axis\n A"
},
{
"path": "eval/GVHMR/hmr4d/utils/geo_transform.py",
"chars": 20723,
"preview": "import numpy as np\nimport cv2\nimport torch\nimport torch.nn.functional as F\nfrom pytorch3d.transforms import so3_exp_map,"
},
{
"path": "eval/GVHMR/hmr4d/utils/ik/ccd_ik.py",
"chars": 5739,
"preview": "# Sebastian IK\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom einops import einsum, rearrange, "
},
{
"path": "eval/GVHMR/hmr4d/utils/kpts/kp2d_utils.py",
"chars": 14207,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\nimport cv2\nimport numpy as np\n\n# expose _taylor to outsi"
},
{
"path": "eval/GVHMR/hmr4d/utils/matrix.py",
"chars": 45885,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport copy\nfrom typing import List, Optional\n\nimport"
},
{
"path": "eval/GVHMR/hmr4d/utils/net_utils.py",
"chars": 5935,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom pathlib import Path\nfrom hmr4d.utils.pylogger im"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/__init__.py",
"chars": 250,
"preview": "try:\n from hmr4d.utils.preproc.tracker import Tracker\n from hmr4d.utils.preproc.vitfeat_extractor import Extractor"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/slam.py",
"chars": 3037,
"preview": "import cv2\nimport time\nimport torch\nfrom multiprocessing import Process, Queue\n\ntry:\n from dpvo.utils import Timer\n "
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/tracker.py",
"chars": 3830,
"preview": "from ultralytics import YOLO\nfrom hmr4d import PROJ_ROOT\n\nimport torch\nimport numpy as np\nfrom tqdm import tqdm\nfrom col"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitfeat_extractor.py",
"chars": 3061,
"preview": "import torch\nfrom hmr4d.network.hmr2 import load_hmr2, HMR2\n\n\nfrom hmr4d.utils.video_io_utils import read_video_np\nimpor"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose.py",
"chars": 5460,
"preview": "import torch\nimport torch.nn.functional as F\nimport numpy as np\nfrom .vitpose_pytorch import build_model\nfrom .vitfeat_e"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/__init__.py",
"chars": 57,
"preview": "from .src.vitpose_infer.model_builder import build_model\n"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/__init__.py",
"chars": 1272,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\n# from .alexnet import AlexNet\n# from .cpm import CPM\n# from .hourglass "
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/alexnet.py",
"chars": 1868,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch.nn as nn\n\nfrom ..builder import BACKBONES\nfrom .base_backbo"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/cpm.py",
"chars": 6374,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport torch\nimport torch.nn as nn\nfrom mmcv.cnn import Con"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/hourglass.py",
"chars": 6946,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, cons"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/hourglass_ae.py",
"chars": 7048,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, MaxP"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/hrformer.py",
"chars": 29872,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\n\nimport math\n\nimport torch\nimport torch.nn as nn\n# from timm.models.laye"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/litehrnet.py",
"chars": 35843,
"preview": "# ------------------------------------------------------------------------------\n# Adapted from https://github.com/HRNet"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/mobilenet_v2.py",
"chars": 9995,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\nimport logging\n\nimport torch.nn as nn\nimport torch.utils.che"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/mobilenet_v3.py",
"chars": 7271,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\nimport logging\n\nimport torch.nn as nn\nfrom mmcv.cnn import C"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/mspn.py",
"chars": 18464,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy as cp\nfrom collections import OrderedDict\n\nimport torch.nn a"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/regnet.py",
"chars": 12005,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport numpy as np\nimport torch.nn as nn\nfrom mmcv.cnn impo"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/resnest.py",
"chars": 12124,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimpor"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/resnext.py",
"chars": 6659,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom mmcv.cnn import build_conv_layer, build_norm_layer\n\nfrom ..builder "
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/rsn.py",
"chars": 21759,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy as cp\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.fu"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/scnet.py",
"chars": 8057,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.function"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/seresnet.py",
"chars": 4602,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch.utils.checkpoint as cp\n\nfrom ..builder import BACKBONES\nfro"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/seresnext.py",
"chars": 7039,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom mmcv.cnn import build_conv_layer, build_norm_layer\n\nfrom ..builder "
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/shufflenet_v1.py",
"chars": 11942,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\nimport logging\n\nimport torch\nimport torch.nn as nn\nimport to"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/shufflenet_v2.py",
"chars": 10666,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\nimport logging\n\nimport torch\nimport torch.nn as nn\nimport to"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/tcn.py",
"chars": 10003,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, buil"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/test_torch.py",
"chars": 1561,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Net(nn.Module):\n\n def __init__(self):\n "
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/__init__.py",
"chars": 358,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .channel_shuffle import channel_shuffle\nfrom .inverted_residual imp"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/channel_shuffle.py",
"chars": 889,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\n\ndef channel_shuffle(x, groups):\n \"\"\"Channel Shuffle op"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/inverted_residual.py",
"chars": 4196,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport torch.nn as nn\nimport torch.utils.checkpoint as cp\nf"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/make_divisible.py",
"chars": 1056,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\ndef make_divisible(value, divisor, min_value=None, min_ratio=0.9):\n \""
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/se_layer.py",
"chars": 1983,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport mmcv\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule\n\n\nclas"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/utils.py",
"chars": 2845,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom collections import OrderedDict\n\nfrom mmcv.runner.checkpoint import "
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/vgg.py",
"chars": 7165,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, constant_init, ka"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/vipnas_mbv3.py",
"chars": 6586,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\nimport logging\n\nimport torch.nn as nn\nfrom mmcv.cnn import C"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/vipnas_resnet.py",
"chars": 21737,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport torch.nn as nn\nimport torch.utils.checkpoint as cp\nf"
},
{
"path": "eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/vit.py",
"chars": 11235,
"preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport math\n\nimport torch\nfrom functools import partial\nimport torch.nn "
}
]
// ... and 76 more files (download for full content)
About this extraction
This page contains the full source code of the KwaiVGI/3DTrajMaster GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 276 files (6.9 MB), approximately 1.8M tokens, and a symbol index with 1654 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.