Showing preview only (4,264K chars total). Download the full file or copy to clipboard to get everything.
Repository: modelscope/DiffSynth-Studio
Branch: main
Commit: 078fc551d924
Files: 841
Total size: 3.9 MB
Directory structure:
gitextract_1d_bzl_w/
├── .github/
│ └── workflows/
│ └── publish.yaml
├── .gitignore
├── LICENSE
├── README.md
├── README_zh.md
├── diffsynth/
│ ├── __init__.py
│ ├── configs/
│ │ ├── __init__.py
│ │ ├── model_configs.py
│ │ └── vram_management_module_maps.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── attention/
│ │ │ ├── __init__.py
│ │ │ └── attention.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── operators.py
│ │ │ └── unified_dataset.py
│ │ ├── device/
│ │ │ ├── __init__.py
│ │ │ └── npu_compatible_device.py
│ │ ├── gradient/
│ │ │ ├── __init__.py
│ │ │ └── gradient_checkpoint.py
│ │ ├── loader/
│ │ │ ├── __init__.py
│ │ │ ├── config.py
│ │ │ ├── file.py
│ │ │ └── model.py
│ │ ├── npu_patch/
│ │ │ └── npu_fused_operator.py
│ │ └── vram/
│ │ ├── __init__.py
│ │ ├── disk_map.py
│ │ ├── initialization.py
│ │ └── layers.py
│ ├── diffusion/
│ │ ├── __init__.py
│ │ ├── base_pipeline.py
│ │ ├── flow_match.py
│ │ ├── logger.py
│ │ ├── loss.py
│ │ ├── parsers.py
│ │ ├── runner.py
│ │ └── training_module.py
│ ├── models/
│ │ ├── anima_dit.py
│ │ ├── dinov3_image_encoder.py
│ │ ├── flux2_dit.py
│ │ ├── flux2_text_encoder.py
│ │ ├── flux2_vae.py
│ │ ├── flux_controlnet.py
│ │ ├── flux_dit.py
│ │ ├── flux_infiniteyou.py
│ │ ├── flux_ipadapter.py
│ │ ├── flux_lora_encoder.py
│ │ ├── flux_lora_patcher.py
│ │ ├── flux_text_encoder_clip.py
│ │ ├── flux_text_encoder_t5.py
│ │ ├── flux_vae.py
│ │ ├── flux_value_control.py
│ │ ├── general_modules.py
│ │ ├── longcat_video_dit.py
│ │ ├── ltx2_audio_vae.py
│ │ ├── ltx2_common.py
│ │ ├── ltx2_dit.py
│ │ ├── ltx2_text_encoder.py
│ │ ├── ltx2_upsampler.py
│ │ ├── ltx2_video_vae.py
│ │ ├── model_loader.py
│ │ ├── mova_audio_dit.py
│ │ ├── mova_audio_vae.py
│ │ ├── mova_dual_tower_bridge.py
│ │ ├── nexus_gen.py
│ │ ├── nexus_gen_ar_model.py
│ │ ├── nexus_gen_projector.py
│ │ ├── qwen_image_controlnet.py
│ │ ├── qwen_image_dit.py
│ │ ├── qwen_image_image2lora.py
│ │ ├── qwen_image_text_encoder.py
│ │ ├── qwen_image_vae.py
│ │ ├── sd_text_encoder.py
│ │ ├── siglip2_image_encoder.py
│ │ ├── step1x_connector.py
│ │ ├── step1x_text_encoder.py
│ │ ├── wan_video_animate_adapter.py
│ │ ├── wan_video_camera_controller.py
│ │ ├── wan_video_dit.py
│ │ ├── wan_video_dit_s2v.py
│ │ ├── wan_video_image_encoder.py
│ │ ├── wan_video_mot.py
│ │ ├── wan_video_motion_controller.py
│ │ ├── wan_video_text_encoder.py
│ │ ├── wan_video_vace.py
│ │ ├── wan_video_vae.py
│ │ ├── wantodance.py
│ │ ├── wav2vec.py
│ │ ├── z_image_controlnet.py
│ │ ├── z_image_dit.py
│ │ ├── z_image_image2lora.py
│ │ └── z_image_text_encoder.py
│ ├── pipelines/
│ │ ├── anima_image.py
│ │ ├── flux2_image.py
│ │ ├── flux_image.py
│ │ ├── ltx2_audio_video.py
│ │ ├── mova_audio_video.py
│ │ ├── qwen_image.py
│ │ ├── wan_video.py
│ │ └── z_image.py
│ ├── utils/
│ │ ├── controlnet/
│ │ │ ├── __init__.py
│ │ │ ├── annotator.py
│ │ │ └── controlnet_input.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── audio.py
│ │ │ ├── audio_video.py
│ │ │ └── media_io_ltx2.py
│ │ ├── lora/
│ │ │ ├── __init__.py
│ │ │ ├── flux.py
│ │ │ ├── general.py
│ │ │ ├── merge.py
│ │ │ └── reset_rank.py
│ │ ├── ses/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── ses.py
│ │ ├── state_dict_converters/
│ │ │ ├── __init__.py
│ │ │ ├── anima_dit.py
│ │ │ ├── flux2_text_encoder.py
│ │ │ ├── flux_controlnet.py
│ │ │ ├── flux_dit.py
│ │ │ ├── flux_infiniteyou.py
│ │ │ ├── flux_ipadapter.py
│ │ │ ├── flux_text_encoder_clip.py
│ │ │ ├── flux_text_encoder_t5.py
│ │ │ ├── flux_vae.py
│ │ │ ├── ltx2_audio_vae.py
│ │ │ ├── ltx2_dit.py
│ │ │ ├── ltx2_text_encoder.py
│ │ │ ├── ltx2_video_vae.py
│ │ │ ├── nexus_gen.py
│ │ │ ├── nexus_gen_projector.py
│ │ │ ├── qwen_image_text_encoder.py
│ │ │ ├── step1x_connector.py
│ │ │ ├── wan_video_animate_adapter.py
│ │ │ ├── wan_video_dit.py
│ │ │ ├── wan_video_image_encoder.py
│ │ │ ├── wan_video_mot.py
│ │ │ ├── wan_video_vace.py
│ │ │ ├── wan_video_vae.py
│ │ │ ├── wans2v_audio_encoder.py
│ │ │ └── z_image_text_encoder.py
│ │ └── xfuser/
│ │ ├── __init__.py
│ │ └── xdit_context_parallel.py
│ └── version.py
├── docs/
│ ├── en/
│ │ ├── .readthedocs.yaml
│ │ ├── API_Reference/
│ │ │ └── core/
│ │ │ ├── attention.md
│ │ │ ├── data.md
│ │ │ ├── gradient.md
│ │ │ ├── loader.md
│ │ │ └── vram.md
│ │ ├── Developer_Guide/
│ │ │ ├── Building_a_Pipeline.md
│ │ │ ├── Enabling_VRAM_management.md
│ │ │ ├── Integrating_Your_Model.md
│ │ │ └── Training_Diffusion_Models.md
│ │ ├── Makefile
│ │ ├── Model_Details/
│ │ │ ├── Anima.md
│ │ │ ├── FLUX.md
│ │ │ ├── FLUX2.md
│ │ │ ├── LTX-2.md
│ │ │ ├── Overview.md
│ │ │ ├── Qwen-Image.md
│ │ │ ├── Wan.md
│ │ │ └── Z-Image.md
│ │ ├── Pipeline_Usage/
│ │ │ ├── Environment_Variables.md
│ │ │ ├── GPU_support.md
│ │ │ ├── Model_Inference.md
│ │ │ ├── Model_Training.md
│ │ │ ├── Setup.md
│ │ │ └── VRAM_management.md
│ │ ├── QA.md
│ │ ├── README.md
│ │ ├── Research_Tutorial/
│ │ │ ├── inference_time_scaling.ipynb
│ │ │ ├── inference_time_scaling.md
│ │ │ ├── train_from_scratch.md
│ │ │ └── train_from_scratch.py
│ │ ├── Training/
│ │ │ ├── Differential_LoRA.md
│ │ │ ├── Direct_Distill.md
│ │ │ ├── FP8_Precision.md
│ │ │ ├── Split_Training.md
│ │ │ ├── Supervised_Fine_Tuning.md
│ │ │ └── Understanding_Diffusion_models.md
│ │ ├── conf.py
│ │ └── index.rst
│ ├── requirements.txt
│ └── zh/
│ ├── .readthedocs.yaml
│ ├── API_Reference/
│ │ └── core/
│ │ ├── attention.md
│ │ ├── data.md
│ │ ├── gradient.md
│ │ ├── loader.md
│ │ └── vram.md
│ ├── Developer_Guide/
│ │ ├── Building_a_Pipeline.md
│ │ ├── Enabling_VRAM_management.md
│ │ ├── Integrating_Your_Model.md
│ │ └── Training_Diffusion_Models.md
│ ├── Makefile
│ ├── Model_Details/
│ │ ├── Anima.md
│ │ ├── FLUX.md
│ │ ├── FLUX2.md
│ │ ├── LTX-2.md
│ │ ├── Overview.md
│ │ ├── Qwen-Image.md
│ │ ├── Wan.md
│ │ └── Z-Image.md
│ ├── Pipeline_Usage/
│ │ ├── Environment_Variables.md
│ │ ├── GPU_support.md
│ │ ├── Model_Inference.md
│ │ ├── Model_Training.md
│ │ ├── Setup.md
│ │ └── VRAM_management.md
│ ├── QA.md
│ ├── README.md
│ ├── Research_Tutorial/
│ │ ├── inference_time_scaling.ipynb
│ │ ├── inference_time_scaling.md
│ │ ├── train_from_scratch.md
│ │ └── train_from_scratch.py
│ ├── Training/
│ │ ├── Differential_LoRA.md
│ │ ├── Direct_Distill.md
│ │ ├── FP8_Precision.md
│ │ ├── Split_Training.md
│ │ ├── Supervised_Fine_Tuning.md
│ │ └── Understanding_Diffusion_models.md
│ ├── conf.py
│ └── index.rst
├── examples/
│ ├── anima/
│ │ ├── README.md
│ │ ├── model_inference/
│ │ │ └── anima-preview.py
│ │ ├── model_inference_low_vram/
│ │ │ └── anima-preview.py
│ │ └── model_training/
│ │ ├── full/
│ │ │ └── anima-preview.sh
│ │ ├── lora/
│ │ │ └── anima-preview.sh
│ │ ├── train.py
│ │ ├── validate_full/
│ │ │ └── anima-preview.py
│ │ └── validate_lora/
│ │ └── anima-preview.py
│ ├── dev_tools/
│ │ ├── fix_path.py
│ │ └── unit_test.py
│ ├── flux/
│ │ ├── README.md
│ │ ├── model_inference/
│ │ │ ├── FLEX.2-preview.py
│ │ │ ├── FLUX.1-Kontext-dev.py
│ │ │ ├── FLUX.1-Krea-dev.py
│ │ │ ├── FLUX.1-dev-AttriCtrl.py
│ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.py
│ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.py
│ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.py
│ │ │ ├── FLUX.1-dev-EliGen.py
│ │ │ ├── FLUX.1-dev-IP-Adapter.py
│ │ │ ├── FLUX.1-dev-InfiniteYou.py
│ │ │ ├── FLUX.1-dev-LoRA-Encoder.py
│ │ │ ├── FLUX.1-dev-LoRA-Fusion.py
│ │ │ ├── FLUX.1-dev.py
│ │ │ ├── Nexus-Gen-Editing.py
│ │ │ ├── Nexus-Gen-Generation.py
│ │ │ └── Step1X-Edit.py
│ │ ├── model_inference_low_vram/
│ │ │ ├── FLEX.2-preview.py
│ │ │ ├── FLUX.1-Kontext-dev.py
│ │ │ ├── FLUX.1-Krea-dev.py
│ │ │ ├── FLUX.1-dev-AttriCtrl.py
│ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.py
│ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.py
│ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.py
│ │ │ ├── FLUX.1-dev-EliGen.py
│ │ │ ├── FLUX.1-dev-IP-Adapter.py
│ │ │ ├── FLUX.1-dev-InfiniteYou.py
│ │ │ ├── FLUX.1-dev-LoRA-Encoder.py
│ │ │ ├── FLUX.1-dev-LoRA-Fusion.py
│ │ │ ├── FLUX.1-dev.py
│ │ │ ├── Nexus-Gen-Editing.py
│ │ │ ├── Nexus-Gen-Generation.py
│ │ │ └── Step1X-Edit.py
│ │ └── model_training/
│ │ ├── full/
│ │ │ ├── FLEX.2-preview.sh
│ │ │ ├── FLUX.1-Kontext-dev.sh
│ │ │ ├── FLUX.1-Krea-dev.sh
│ │ │ ├── FLUX.1-dev-AttriCtrl.sh
│ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.sh
│ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.sh
│ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.sh
│ │ │ ├── FLUX.1-dev-IP-Adapter.sh
│ │ │ ├── FLUX.1-dev-InfiniteYou.sh
│ │ │ ├── FLUX.1-dev-LoRA-Encoder.sh
│ │ │ ├── FLUX.1-dev.sh
│ │ │ ├── Nexus-Gen.sh
│ │ │ ├── Step1X-Edit.sh
│ │ │ ├── accelerate_config.yaml
│ │ │ ├── accelerate_config_zero2offload.yaml
│ │ │ └── accelerate_config_zero3.yaml
│ │ ├── lora/
│ │ │ ├── FLEX.2-preview.sh
│ │ │ ├── FLUX.1-Kontext-dev.sh
│ │ │ ├── FLUX.1-Krea-dev.sh
│ │ │ ├── FLUX.1-dev-AttriCtrl.sh
│ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.sh
│ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.sh
│ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.sh
│ │ │ ├── FLUX.1-dev-EliGen.sh
│ │ │ ├── FLUX.1-dev-IP-Adapter.sh
│ │ │ ├── FLUX.1-dev-InfiniteYou.sh
│ │ │ ├── FLUX.1-dev.sh
│ │ │ ├── Nexus-Gen.sh
│ │ │ └── Step1X-Edit.sh
│ │ ├── special/
│ │ │ └── npu_training/
│ │ │ ├── FLUX.1-Kontext-dev-NPU.sh
│ │ │ └── FLUX.1-dev-NPU.sh
│ │ ├── train.py
│ │ ├── validate_full/
│ │ │ ├── FLEX.2-preview.py
│ │ │ ├── FLUX.1-Kontext-dev.py
│ │ │ ├── FLUX.1-Krea-dev.py
│ │ │ ├── FLUX.1-dev-AttriCtrl.py
│ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.py
│ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.py
│ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.py
│ │ │ ├── FLUX.1-dev-IP-Adapter.py
│ │ │ ├── FLUX.1-dev-InfiniteYou.py
│ │ │ ├── FLUX.1-dev-LoRA-Encoder.py
│ │ │ ├── FLUX.1-dev.py
│ │ │ ├── Nexus-Gen.py
│ │ │ └── Step1X-Edit.py
│ │ └── validate_lora/
│ │ ├── FLEX.2-preview.py
│ │ ├── FLUX.1-Kontext-dev.py
│ │ ├── FLUX.1-Krea-dev.py
│ │ ├── FLUX.1-dev-AttriCtrl.py
│ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.py
│ │ ├── FLUX.1-dev-Controlnet-Union-alpha.py
│ │ ├── FLUX.1-dev-Controlnet-Upscaler.py
│ │ ├── FLUX.1-dev-EliGen.py
│ │ ├── FLUX.1-dev-IP-Adapter.py
│ │ ├── FLUX.1-dev-InfiniteYou.py
│ │ ├── FLUX.1-dev.py
│ │ ├── Nexus-Gen.py
│ │ └── Step1X-Edit.py
│ ├── flux2/
│ │ ├── README.md
│ │ ├── model_inference/
│ │ │ ├── FLUX.2-dev.py
│ │ │ ├── FLUX.2-klein-4B.py
│ │ │ ├── FLUX.2-klein-9B.py
│ │ │ ├── FLUX.2-klein-base-4B.py
│ │ │ └── FLUX.2-klein-base-9B.py
│ │ ├── model_inference_low_vram/
│ │ │ ├── FLUX.2-dev.py
│ │ │ ├── FLUX.2-klein-4B.py
│ │ │ ├── FLUX.2-klein-9B.py
│ │ │ ├── FLUX.2-klein-base-4B.py
│ │ │ └── FLUX.2-klein-base-9B.py
│ │ └── model_training/
│ │ ├── full/
│ │ │ ├── FLUX.2-klein-4B.sh
│ │ │ ├── FLUX.2-klein-9B.sh
│ │ │ ├── FLUX.2-klein-base-4B.sh
│ │ │ ├── FLUX.2-klein-base-9B.sh
│ │ │ ├── accelerate_config.yaml
│ │ │ └── accelerate_config_zero3.yaml
│ │ ├── lora/
│ │ │ ├── FLUX.2-dev.sh
│ │ │ ├── FLUX.2-klein-4B.sh
│ │ │ ├── FLUX.2-klein-9B.sh
│ │ │ ├── FLUX.2-klein-base-4B.sh
│ │ │ └── FLUX.2-klein-base-9B.sh
│ │ ├── special/
│ │ │ └── npu_training/
│ │ │ ├── FLUX.2-dev-Lora-NPU.sh
│ │ │ └── FLUX.2-klein-9B-NPU.sh
│ │ ├── train.py
│ │ ├── validate_full/
│ │ │ ├── FLUX.2-klein-4B.py
│ │ │ ├── FLUX.2-klein-9B.py
│ │ │ ├── FLUX.2-klein-base-4B.py
│ │ │ └── FLUX.2-klein-base-9B.py
│ │ └── validate_lora/
│ │ ├── FLUX.2-dev.py
│ │ ├── FLUX.2-klein-4B.py
│ │ ├── FLUX.2-klein-9B.py
│ │ ├── FLUX.2-klein-base-4B.py
│ │ └── FLUX.2-klein-base-9B.py
│ ├── ltx2/
│ │ ├── README.md
│ │ ├── model_inference/
│ │ │ ├── LTX-2-I2AV-DistilledPipeline.py
│ │ │ ├── LTX-2-I2AV-OneStage.py
│ │ │ ├── LTX-2-I2AV-TwoStage.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-In.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Left.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Out.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Right.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Jib-Down.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Jib-Up.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Static.py
│ │ │ ├── LTX-2-T2AV-DistilledPipeline.py
│ │ │ ├── LTX-2-T2AV-IC-LoRA-Detailer.py
│ │ │ ├── LTX-2-T2AV-IC-LoRA-Union-Control.py
│ │ │ ├── LTX-2-T2AV-OneStage.py
│ │ │ ├── LTX-2-T2AV-TwoStage.py
│ │ │ ├── LTX-2.3-A2V-TwoStage.py
│ │ │ ├── LTX-2.3-I2AV-DistilledPipeline.py
│ │ │ ├── LTX-2.3-I2AV-OneStage.py
│ │ │ ├── LTX-2.3-I2AV-TwoStage.py
│ │ │ ├── LTX-2.3-T2AV-DistilledPipeline.py
│ │ │ ├── LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py
│ │ │ ├── LTX-2.3-T2AV-IC-LoRA-Union-Control.py
│ │ │ ├── LTX-2.3-T2AV-OneStage.py
│ │ │ ├── LTX-2.3-T2AV-TwoStage-Retake.py
│ │ │ └── LTX-2.3-T2AV-TwoStage.py
│ │ ├── model_inference_low_vram/
│ │ │ ├── LTX-2-I2AV-DistilledPipeline.py
│ │ │ ├── LTX-2-I2AV-OneStage.py
│ │ │ ├── LTX-2-I2AV-TwoStage.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-In.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Left.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Out.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Right.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Jib-Down.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Jib-Up.py
│ │ │ ├── LTX-2-T2AV-Camera-Control-Static.py
│ │ │ ├── LTX-2-T2AV-DistilledPipeline.py
│ │ │ ├── LTX-2-T2AV-IC-LoRA-Detailer.py
│ │ │ ├── LTX-2-T2AV-IC-LoRA-Union-Control.py
│ │ │ ├── LTX-2-T2AV-OneStage.py
│ │ │ ├── LTX-2-T2AV-TwoStage.py
│ │ │ ├── LTX-2.3-A2V-TwoStage.py
│ │ │ ├── LTX-2.3-I2AV-DistilledPipeline.py
│ │ │ ├── LTX-2.3-I2AV-OneStage.py
│ │ │ ├── LTX-2.3-I2AV-TwoStage.py
│ │ │ ├── LTX-2.3-T2AV-DistilledPipeline.py
│ │ │ ├── LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py
│ │ │ ├── LTX-2.3-T2AV-IC-LoRA-Union-Control.py
│ │ │ ├── LTX-2.3-T2AV-OneStage.py
│ │ │ ├── LTX-2.3-T2AV-TwoStage-Retake.py
│ │ │ └── LTX-2.3-T2AV-TwoStage.py
│ │ └── model_training/
│ │ ├── full/
│ │ │ ├── LTX-2-T2AV-splited.sh
│ │ │ ├── LTX-2.3-I2AV-splited.sh
│ │ │ └── LTX-2.3-T2AV-splited.sh
│ │ ├── lora/
│ │ │ ├── LTX-2-T2AV-IC-LoRA-splited.sh
│ │ │ ├── LTX-2-T2AV-noaudio.sh
│ │ │ ├── LTX-2-T2AV-splited.sh
│ │ │ ├── LTX-2.3-I2AV-splited.sh
│ │ │ ├── LTX-2.3-T2AV-IC-LoRA-splited.sh
│ │ │ └── LTX-2.3-T2AV-splited.sh
│ │ ├── scripts/
│ │ │ ├── split_model_statedicts.py
│ │ │ └── split_model_statedicts_ltx2.3.py
│ │ ├── train.py
│ │ ├── validate_full/
│ │ │ ├── LTX-2-T2AV.py
│ │ │ ├── LTX-2.3-I2AV.py
│ │ │ └── LTX-2.3-T2AV.py
│ │ └── validate_lora/
│ │ ├── LTX-2-T2AV-IC-LoRA.py
│ │ ├── LTX-2-T2AV.py
│ │ ├── LTX-2-T2AV_noaudio.py
│ │ ├── LTX-2.3-I2AV.py
│ │ ├── LTX-2.3-T2AV-IC-LoRA.py
│ │ └── LTX-2.3-T2AV.py
│ ├── mova/
│ │ ├── README.md
│ │ ├── acceleration/
│ │ │ └── unified_sequence_parallel.py
│ │ ├── model_inference/
│ │ │ ├── MOVA-360p-I2AV.py
│ │ │ └── MOVA-720p-I2AV.py
│ │ ├── model_inference_low_vram/
│ │ │ ├── MOVA-360p-I2AV.py
│ │ │ └── MOVA-720p-I2AV.py
│ │ └── model_training/
│ │ ├── full/
│ │ │ ├── MOVA-360P-I2AV.sh
│ │ │ └── MOVA-720P-I2AV.sh
│ │ ├── lora/
│ │ │ ├── MOVA-360P-I2AV.sh
│ │ │ └── MOVA-720P-I2AV.sh
│ │ ├── train.py
│ │ ├── validate_full/
│ │ │ ├── MOVA-360p-I2AV.py
│ │ │ └── MOVA-720p-I2AV.py
│ │ └── validate_lora/
│ │ ├── MOVA-360p-I2AV.py
│ │ └── MOVA-720p-I2AV.py
│ ├── qwen_image/
│ │ ├── README.md
│ │ ├── model_inference/
│ │ │ ├── FireRed-Image-Edit-1.0.py
│ │ │ ├── FireRed-Image-Edit-1.1.py
│ │ │ ├── Qwen-Image-2512.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-InpaintCanny.py
│ │ │ ├── Qwen-Image-Distill-DMD2.py
│ │ │ ├── Qwen-Image-Distill-Full.py
│ │ │ ├── Qwen-Image-Distill-LoRA.py
│ │ │ ├── Qwen-Image-Edit-2509.py
│ │ │ ├── Qwen-Image-Edit-2511-ICEdit.py
│ │ │ ├── Qwen-Image-Edit-2511-Lightning.py
│ │ │ ├── Qwen-Image-Edit-2511.py
│ │ │ ├── Qwen-Image-Edit-Lowres-Fix.py
│ │ │ ├── Qwen-Image-Edit.py
│ │ │ ├── Qwen-Image-EliGen-Poster.py
│ │ │ ├── Qwen-Image-EliGen-V2.py
│ │ │ ├── Qwen-Image-EliGen.py
│ │ │ ├── Qwen-Image-In-Context-Control-Union.py
│ │ │ ├── Qwen-Image-Layered-Control-V2.py
│ │ │ ├── Qwen-Image-Layered-Control.py
│ │ │ ├── Qwen-Image-Layered.py
│ │ │ ├── Qwen-Image-i2L.py
│ │ │ └── Qwen-Image.py
│ │ ├── model_inference_low_vram/
│ │ │ ├── FireRed-Image-Edit-1.0.py
│ │ │ ├── FireRed-Image-Edit-1.1.py
│ │ │ ├── Qwen-Image-2512.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-InpaintCanny.py
│ │ │ ├── Qwen-Image-Distill-DMD2.py
│ │ │ ├── Qwen-Image-Distill-Full.py
│ │ │ ├── Qwen-Image-Distill-LoRA.py
│ │ │ ├── Qwen-Image-Edit-2509.py
│ │ │ ├── Qwen-Image-Edit-2511-ICEdit.py
│ │ │ ├── Qwen-Image-Edit-2511-Lightning.py
│ │ │ ├── Qwen-Image-Edit-2511.py
│ │ │ ├── Qwen-Image-Edit-Lowres-Fix.py
│ │ │ ├── Qwen-Image-Edit.py
│ │ │ ├── Qwen-Image-EliGen-Poster.py
│ │ │ ├── Qwen-Image-EliGen-V2.py
│ │ │ ├── Qwen-Image-EliGen.py
│ │ │ ├── Qwen-Image-In-Context-Control-Union.py
│ │ │ ├── Qwen-Image-Layered-Control-V2.py
│ │ │ ├── Qwen-Image-Layered-Control.py
│ │ │ ├── Qwen-Image-Layered.py
│ │ │ ├── Qwen-Image-i2L.py
│ │ │ └── Qwen-Image.py
│ │ └── model_training/
│ │ ├── full/
│ │ │ ├── FireRed-Image-Edit-1.0.sh
│ │ │ ├── FireRed-Image-Edit-1.1.sh
│ │ │ ├── Qwen-Image-2512.sh
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.sh
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.sh
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.sh
│ │ │ ├── Qwen-Image-Distill-Full.sh
│ │ │ ├── Qwen-Image-Edit-2509.sh
│ │ │ ├── Qwen-Image-Edit-2511.sh
│ │ │ ├── Qwen-Image-Edit.sh
│ │ │ ├── Qwen-Image-Layered-Control.sh
│ │ │ ├── Qwen-Image-Layered.sh
│ │ │ ├── Qwen-Image.sh
│ │ │ ├── accelerate_config.yaml
│ │ │ ├── accelerate_config_zero2offload.yaml
│ │ │ └── accelerate_config_zero3.yaml
│ │ ├── lora/
│ │ │ ├── FireRed-Image-Edit-1.0.sh
│ │ │ ├── FireRed-Image-Edit-1.1.sh
│ │ │ ├── Qwen-Image-2512.sh
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.sh
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.sh
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.sh
│ │ │ ├── Qwen-Image-Distill-Full.sh
│ │ │ ├── Qwen-Image-Distill-LoRA.sh
│ │ │ ├── Qwen-Image-Edit-2509.sh
│ │ │ ├── Qwen-Image-Edit-2511.sh
│ │ │ ├── Qwen-Image-Edit.sh
│ │ │ ├── Qwen-Image-EliGen-Poster.sh
│ │ │ ├── Qwen-Image-EliGen.sh
│ │ │ ├── Qwen-Image-In-Context-Control-Union.sh
│ │ │ ├── Qwen-Image-Layered-Control-V2.sh
│ │ │ ├── Qwen-Image-Layered-Control.sh
│ │ │ ├── Qwen-Image-Layered.sh
│ │ │ └── Qwen-Image.sh
│ │ ├── scripts/
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Initialize.py
│ │ │ └── Qwen-Image-Blockwise-ControlNet-Inpaint-Initialize.py
│ │ ├── special/
│ │ │ ├── differential_training/
│ │ │ │ └── Qwen-Image-LoRA.sh
│ │ │ ├── fp8_training/
│ │ │ │ ├── Qwen-Image-LoRA.sh
│ │ │ │ └── validate.py
│ │ │ ├── low_vram_training/
│ │ │ │ ├── Qwen-Image-LoRA.sh
│ │ │ │ ├── deepspeed_zero3_cpuoffload.yaml
│ │ │ │ └── ds_z3_cpuoffload.json
│ │ │ ├── npu_training/
│ │ │ │ ├── Qwen-Image-Edit-2509-LoRA-NPU.sh
│ │ │ │ ├── Qwen-Image-Edit-2509-NPU.sh
│ │ │ │ └── Qwen-Image-LoRA-NPU.sh
│ │ │ ├── simple/
│ │ │ │ └── train.py
│ │ │ └── split_training/
│ │ │ ├── Qwen-Image-LoRA.sh
│ │ │ └── validate.py
│ │ ├── train.py
│ │ ├── validate_full/
│ │ │ ├── FireRed-Image-Edit-1.0.py
│ │ │ ├── FireRed-Image-Edit-1.1.py
│ │ │ ├── Qwen-Image-2512.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.py
│ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.py
│ │ │ ├── Qwen-Image-Distill-Full.py
│ │ │ ├── Qwen-Image-Edit-2509.py
│ │ │ ├── Qwen-Image-Edit-2511.py
│ │ │ ├── Qwen-Image-Edit.py
│ │ │ ├── Qwen-Image-Layered-Control.py
│ │ │ ├── Qwen-Image-Layered.py
│ │ │ └── Qwen-Image.py
│ │ └── validate_lora/
│ │ ├── FireRed-Image-Edit-1.0.py
│ │ ├── FireRed-Image-Edit-1.1.py
│ │ ├── Qwen-Image-2512.py
│ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.py
│ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.py
│ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.py
│ │ ├── Qwen-Image-Distill-Full.py
│ │ ├── Qwen-Image-Distill-LoRA.py
│ │ ├── Qwen-Image-Edit-2509.py
│ │ ├── Qwen-Image-Edit-2511.py
│ │ ├── Qwen-Image-Edit.py
│ │ ├── Qwen-Image-EliGen-Poster.py
│ │ ├── Qwen-Image-EliGen.py
│ │ ├── Qwen-Image-In-Context-Control-Union.py
│ │ ├── Qwen-Image-Layered-Control-V2.py
│ │ ├── Qwen-Image-Layered-Control.py
│ │ ├── Qwen-Image-Layered.py
│ │ └── Qwen-Image.py
│ ├── wanvideo/
│ │ ├── README.md
│ │ ├── acceleration/
│ │ │ └── unified_sequence_parallel.py
│ │ ├── model_inference/
│ │ │ ├── LongCat-Video.py
│ │ │ ├── Video-As-Prompt-Wan2.1-14B.py
│ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.py
│ │ │ ├── Wan2.1-FLF2V-14B-720P.py
│ │ │ ├── Wan2.1-Fun-1.3B-Control.py
│ │ │ ├── Wan2.1-Fun-1.3B-InP.py
│ │ │ ├── Wan2.1-Fun-14B-Control.py
│ │ │ ├── Wan2.1-Fun-14B-InP.py
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.py
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.py
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.py
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.py
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control.py
│ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.py
│ │ │ ├── Wan2.1-I2V-14B-480P.py
│ │ │ ├── Wan2.1-I2V-14B-720P.py
│ │ │ ├── Wan2.1-T2V-1.3B.py
│ │ │ ├── Wan2.1-T2V-14B.py
│ │ │ ├── Wan2.1-VACE-1.3B-Preview.py
│ │ │ ├── Wan2.1-VACE-1.3B.py
│ │ │ ├── Wan2.1-VACE-14B.py
│ │ │ ├── Wan2.2-Animate-14B.py
│ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.py
│ │ │ ├── Wan2.2-Fun-A14B-Control.py
│ │ │ ├── Wan2.2-Fun-A14B-InP.py
│ │ │ ├── Wan2.2-I2V-A14B.py
│ │ │ ├── Wan2.2-S2V-14B.py
│ │ │ ├── Wan2.2-S2V-14B_multi_clips.py
│ │ │ ├── Wan2.2-T2V-A14B.py
│ │ │ ├── Wan2.2-TI2V-5B.py
│ │ │ ├── Wan2.2-VACE-Fun-A14B.py
│ │ │ ├── WanToDance-14B-global.py
│ │ │ ├── WanToDance-14B-local.py
│ │ │ └── krea-realtime-video.py
│ │ ├── model_inference_low_vram/
│ │ │ ├── LongCat-Video.py
│ │ │ ├── Video-As-Prompt-Wan2.1-14B.py
│ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.py
│ │ │ ├── Wan2.1-FLF2V-14B-720P.py
│ │ │ ├── Wan2.1-Fun-1.3B-Control.py
│ │ │ ├── Wan2.1-Fun-1.3B-InP.py
│ │ │ ├── Wan2.1-Fun-14B-Control.py
│ │ │ ├── Wan2.1-Fun-14B-InP.py
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.py
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.py
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.py
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.py
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control.py
│ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.py
│ │ │ ├── Wan2.1-I2V-14B-480P.py
│ │ │ ├── Wan2.1-I2V-14B-720P.py
│ │ │ ├── Wan2.1-T2V-1.3B.py
│ │ │ ├── Wan2.1-T2V-14B.py
│ │ │ ├── Wan2.1-VACE-1.3B-Preview.py
│ │ │ ├── Wan2.1-VACE-1.3B.py
│ │ │ ├── Wan2.1-VACE-14B.py
│ │ │ ├── Wan2.2-Animate-14B.py
│ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.py
│ │ │ ├── Wan2.2-Fun-A14B-Control.py
│ │ │ ├── Wan2.2-Fun-A14B-InP.py
│ │ │ ├── Wan2.2-I2V-A14B.py
│ │ │ ├── Wan2.2-S2V-14B.py
│ │ │ ├── Wan2.2-S2V-14B_multi_clips.py
│ │ │ ├── Wan2.2-T2V-A14B.py
│ │ │ ├── Wan2.2-TI2V-5B.py
│ │ │ ├── Wan2.2-VACE-Fun-A14B.py
│ │ │ ├── WanToDance-14B-global.py
│ │ │ ├── WanToDance-14B-local.py
│ │ │ └── krea-realtime-video.py
│ │ └── model_training/
│ │ ├── full/
│ │ │ ├── LongCat-Video.sh
│ │ │ ├── Video-As-Prompt-Wan2.1-14B.sh
│ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.sh
│ │ │ ├── Wan2.1-FLF2V-14B-720P.sh
│ │ │ ├── Wan2.1-Fun-1.3B-Control.sh
│ │ │ ├── Wan2.1-Fun-1.3B-InP.sh
│ │ │ ├── Wan2.1-Fun-14B-Control.sh
│ │ │ ├── Wan2.1-Fun-14B-InP.sh
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.sh
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.sh
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.sh
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control.sh
│ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.sh
│ │ │ ├── Wan2.1-I2V-14B-480P.sh
│ │ │ ├── Wan2.1-I2V-14B-720P.sh
│ │ │ ├── Wan2.1-T2V-1.3B.sh
│ │ │ ├── Wan2.1-T2V-14B.sh
│ │ │ ├── Wan2.1-VACE-1.3B-Preview.sh
│ │ │ ├── Wan2.1-VACE-1.3B.sh
│ │ │ ├── Wan2.1-VACE-14B.sh
│ │ │ ├── Wan2.2-Animate-14B.sh
│ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.sh
│ │ │ ├── Wan2.2-Fun-A14B-Control.sh
│ │ │ ├── Wan2.2-Fun-A14B-InP.sh
│ │ │ ├── Wan2.2-I2V-A14B.sh
│ │ │ ├── Wan2.2-S2V-14B.sh
│ │ │ ├── Wan2.2-T2V-A14B.sh
│ │ │ ├── Wan2.2-TI2V-5B.sh
│ │ │ ├── Wan2.2-VACE-Fun-A14B.sh
│ │ │ ├── WanToDance-14B-global.sh
│ │ │ ├── WanToDance-14B-local.sh
│ │ │ ├── accelerate_config_14B.yaml
│ │ │ ├── accelerate_config_zero3.yaml
│ │ │ └── krea-realtime-video.sh
│ │ ├── lora/
│ │ │ ├── LongCat-Video.sh
│ │ │ ├── Video-As-Prompt-Wan2.1-14B.sh
│ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.sh
│ │ │ ├── Wan2.1-FLF2V-14B-720P.sh
│ │ │ ├── Wan2.1-Fun-1.3B-Control.sh
│ │ │ ├── Wan2.1-Fun-1.3B-InP.sh
│ │ │ ├── Wan2.1-Fun-14B-Control.sh
│ │ │ ├── Wan2.1-Fun-14B-InP.sh
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.sh
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.sh
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.sh
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control.sh
│ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.sh
│ │ │ ├── Wan2.1-I2V-14B-480P.sh
│ │ │ ├── Wan2.1-I2V-14B-720P.sh
│ │ │ ├── Wan2.1-T2V-1.3B.sh
│ │ │ ├── Wan2.1-T2V-14B.sh
│ │ │ ├── Wan2.1-VACE-1.3B-Preview.sh
│ │ │ ├── Wan2.1-VACE-1.3B.sh
│ │ │ ├── Wan2.1-VACE-14B.sh
│ │ │ ├── Wan2.2-Animate-14B.sh
│ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.sh
│ │ │ ├── Wan2.2-Fun-A14B-Control.sh
│ │ │ ├── Wan2.2-Fun-A14B-InP.sh
│ │ │ ├── Wan2.2-I2V-A14B.sh
│ │ │ ├── Wan2.2-S2V-14B.sh
│ │ │ ├── Wan2.2-T2V-A14B.sh
│ │ │ ├── Wan2.2-TI2V-5B.sh
│ │ │ ├── Wan2.2-VACE-Fun-A14B.sh
│ │ │ ├── WanToDance-14B-global.sh
│ │ │ ├── WanToDance-14B-local.sh
│ │ │ └── krea-realtime-video.sh
│ │ ├── special/
│ │ │ ├── direct_distill/
│ │ │ │ ├── Wan2.1-T2V-1.3B.sh
│ │ │ │ └── validate.py
│ │ │ ├── fp8_training/
│ │ │ │ ├── Wan2.1-I2V-14B-480P.sh
│ │ │ │ └── validate.py
│ │ │ ├── low_vram_training/
│ │ │ │ ├── Wan2.1-I2V-14B-480P.sh
│ │ │ │ └── validate.py
│ │ │ ├── npu_training/
│ │ │ │ ├── Wan2.1-T2V-14B-NPU.sh
│ │ │ │ ├── Wan2.2-T2V-A14B-NPU.sh
│ │ │ │ └── Wan2.2-VACE-Fun-A14B-NPU.sh
│ │ │ └── split_training/
│ │ │ ├── Wan2.1-I2V-14B-480P.sh
│ │ │ └── validate.py
│ │ ├── train.py
│ │ ├── validate_full/
│ │ │ ├── LongCat-Video.py
│ │ │ ├── Video-As-Prompt-Wan2.1-14B.py
│ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.py
│ │ │ ├── Wan2.1-FLF2V-14B-720P.py
│ │ │ ├── Wan2.1-Fun-1.3B-Control.py
│ │ │ ├── Wan2.1-Fun-1.3B-InP.py
│ │ │ ├── Wan2.1-Fun-14B-Control.py
│ │ │ ├── Wan2.1-Fun-14B-InP.py
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.py
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.py
│ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.py
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.py
│ │ │ ├── Wan2.1-Fun-V1.1-14B-Control.py
│ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.py
│ │ │ ├── Wan2.1-I2V-14B-480P.py
│ │ │ ├── Wan2.1-I2V-14B-720P.py
│ │ │ ├── Wan2.1-T2V-1.3B.py
│ │ │ ├── Wan2.1-T2V-14B.py
│ │ │ ├── Wan2.1-VACE-1.3B-Preview.py
│ │ │ ├── Wan2.1-VACE-1.3B.py
│ │ │ ├── Wan2.1-VACE-14B.py
│ │ │ ├── Wan2.2-Animate-14B.py
│ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.py
│ │ │ ├── Wan2.2-Fun-A14B-Control.py
│ │ │ ├── Wan2.2-Fun-A14B-InP.py
│ │ │ ├── Wan2.2-I2V-A14B.py
│ │ │ ├── Wan2.2-S2V-14B.py
│ │ │ ├── Wan2.2-T2V-A14B.py
│ │ │ ├── Wan2.2-TI2V-5B.py
│ │ │ ├── Wan2.2-VACE-Fun-A14B.py
│ │ │ ├── WanToDance-14B-global.py
│ │ │ ├── WanToDance-14B-local.py
│ │ │ └── krea-realtime-video.py
│ │ └── validate_lora/
│ │ ├── LongCat-Video.py
│ │ ├── Video-As-Prompt-Wan2.1-14B.py
│ │ ├── Wan2.1-1.3b-speedcontrol-v1.py
│ │ ├── Wan2.1-FLF2V-14B-720P.py
│ │ ├── Wan2.1-Fun-1.3B-Control.py
│ │ ├── Wan2.1-Fun-1.3B-InP.py
│ │ ├── Wan2.1-Fun-14B-Control.py
│ │ ├── Wan2.1-Fun-14B-InP.py
│ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.py
│ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.py
│ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.py
│ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.py
│ │ ├── Wan2.1-Fun-V1.1-14B-Control.py
│ │ ├── Wan2.1-Fun-V1.1-14B-InP.py
│ │ ├── Wan2.1-I2V-14B-480P.py
│ │ ├── Wan2.1-I2V-14B-720P.py
│ │ ├── Wan2.1-T2V-1.3B.py
│ │ ├── Wan2.1-T2V-14B.py
│ │ ├── Wan2.1-VACE-1.3B-Preview.py
│ │ ├── Wan2.1-VACE-1.3B.py
│ │ ├── Wan2.1-VACE-14B.py
│ │ ├── Wan2.2-Animate-14B.py
│ │ ├── Wan2.2-Fun-A14B-Control-Camera.py
│ │ ├── Wan2.2-Fun-A14B-Control.py
│ │ ├── Wan2.2-Fun-A14B-InP.py
│ │ ├── Wan2.2-I2V-A14B.py
│ │ ├── Wan2.2-S2V-14B.py
│ │ ├── Wan2.2-T2V-A14B.py
│ │ ├── Wan2.2-TI2V-5B.py
│ │ ├── Wan2.2-VACE-Fun-A14B.py
│ │ ├── WanToDance-14B-global.py
│ │ ├── WanToDance-14B-local.py
│ │ └── krea-realtime-video.py
│ └── z_image/
│ ├── README.md
│ ├── model_inference/
│ │ ├── Z-Image-Omni-Base-i2L.py
│ │ ├── Z-Image-Omni-Base.py
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.py
│ │ ├── Z-Image-Turbo.py
│ │ ├── Z-Image-i2L.py
│ │ └── Z-Image.py
│ ├── model_inference_low_vram/
│ │ ├── Z-Image-Omni-Base-i2L.py
│ │ ├── Z-Image-Omni-Base.py
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.py
│ │ ├── Z-Image-Turbo.py
│ │ ├── Z-Image-i2L.py
│ │ └── Z-Image.py
│ └── model_training/
│ ├── full/
│ │ ├── Z-Image-Omni-Base.sh
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh
│ │ ├── Z-Image-Turbo.sh
│ │ ├── Z-Image.sh
│ │ ├── accelerate_config.yaml
│ │ └── accelerate_config_zero3.yaml
│ ├── lora/
│ │ ├── Z-Image-Omni-Base.sh
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh
│ │ ├── Z-Image-Turbo.sh
│ │ └── Z-Image.sh
│ ├── special/
│ │ ├── differential_training/
│ │ │ ├── Z-Image-Turbo.sh
│ │ │ └── validate.py
│ │ ├── npu_training/
│ │ │ └── Z-Image-Turbo-NPU.sh
│ │ └── trajectory_imitation/
│ │ ├── Z-Image-Turbo.sh
│ │ └── validate.py
│ ├── train.py
│ ├── validate_full/
│ │ ├── Z-Image-Omni-Base.py
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py
│ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.py
│ │ ├── Z-Image-Turbo.py
│ │ └── Z-Image.py
│ └── validate_lora/
│ ├── Z-Image-Omni-Base.py
│ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py
│ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py
│ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.py
│ ├── Z-Image-Turbo.py
│ └── Z-Image.py
└── pyproject.toml
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/publish.yaml
================================================
# Release workflow: builds the DiffSynth distribution and uploads it to PyPI
# whenever a tag whose name starts with "v" is pushed.
name: release
on:
  push:
    tags:
      - 'v**'

# At most one publish run per workflow/ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-publish
  cancel-in-progress: true

jobs:
  build-n-publish:
    # The ubuntu-20.04 hosted runner image has been retired, so jobs pinned to it
    # are never picked up; use a currently supported image instead.
    runs-on: ubuntu-22.04
    #if: startsWith(github.event.ref, 'refs/tags')
    steps:
      # v2 of these actions runs on a deprecated Node.js runtime.
      - uses: actions/checkout@v4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install wheel
        # The repository root ships pyproject.toml but no requirements.txt, so
        # `pip install -r requirements.txt` would fail here. Install the `build`
        # frontend explicitly because the next step invokes `python -m build`.
        run: pip install wheel==0.44.0 build
      - name: Build DiffSynth
        run: python -m build
      - name: Publish package to PyPI
        # Hand credentials to twine through its documented environment variables
        # rather than interpolating the secret into the shell command line.
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          pip install twine
          twine upload dist/* --skip-existing
================================================
FILE: .gitignore
================================================
/data
/models
/scripts
/diffusers
/.vscode
*.pkl
*.safetensors
*.pth
*.ckpt
*.pt
*.bin
*.DS_Store
*.msc
*.mv
log*.txt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [2023] [Zhongjie Duan]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# DiffSynth-Studio
<p><a href="https://github.com/modelscope/DiffSynth-Studio"><img src=".github/workflows/logo.gif" title="Logo" style="max-width:100%;" width="55" /></a> <a href="https://trendshift.io/repositories/10946" target="_blank"><img src="https://trendshift.io/api/badge/repositories/10946" alt="modelscope%2FDiffSynth-Studio | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a></p>
[PyPI](https://pypi.org/project/DiffSynth/)
[License](https://github.com/modelscope/DiffSynth-Studio/blob/master/LICENSE)
[Open issues](https://github.com/modelscope/DiffSynth-Studio/issues)
[Pull requests](https://GitHub.com/modelscope/DiffSynth-Studio/pull/)
[Latest commit](https://GitHub.com/modelscope/DiffSynth-Studio/commit/)
[切换到中文版](./README_zh.md)
## Introduction
> DiffSynth-Studio Documentation: [中文版](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/), [English version](https://diffsynth-studio-doc.readthedocs.io/en/latest/)
Welcome to the magical world of Diffusion models! DiffSynth-Studio is an open-source Diffusion model engine developed and maintained by the [ModelScope Community](https://www.modelscope.cn/). We hope to foster technological innovation through framework construction, aggregate the power of the open-source community, and explore the boundaries of generative model technology!
DiffSynth currently includes two open-source projects:
* [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): Focused on aggressive technical exploration, targeting academia, and providing cutting-edge model capability support.
* [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine): Focused on stable model deployment, targeting industry, and providing higher computational performance and more stable features.
[DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) and [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine) are the core engines of the ModelScope AIGC zone. Welcome to experience our carefully crafted productized features:
* ModelScope AIGC Zone (for Chinese users): https://modelscope.cn/aigc/home
* ModelScope Civision (for global users): https://modelscope.ai/civision/home
We believe that a well-developed open-source code framework can lower the threshold for technical exploration. We have achieved many [interesting technologies](#innovative-achievements) based on this codebase. Perhaps you also have many wild ideas, and with DiffSynth-Studio, you can quickly realize these ideas. For this reason, we have prepared detailed documentation for developers. We hope that through these documents, developers can understand the principles of Diffusion models, and we look forward to expanding the boundaries of technology together with you.
## Update History
> DiffSynth-Studio has undergone major version updates, and some old features are no longer maintained. If you need to use old features, please switch to the [last historical version](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3) before the major version update.
> This project currently has a small development team, with most of the work handled by [Artiprocher](https://github.com/Artiprocher) and [mi804](https://github.com/mi804). As a result, new features are developed relatively slowly, and our capacity to respond to and resolve issues is limited. We apologize for this and ask for your understanding.
- **January 19, 2026**: Added support for [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) and [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/Wan.md) and [example code](/examples/mova/) are now available.
- **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The supported features include text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. Complete inference and training functionalities are supported. For details, please refer to the [documentation](/docs/en/Model_Details/LTX-2.md) and [code](/examples/ltx2/).
- **March 3, 2026**: We released the [DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2) model, which is an updated version of Qwen-Image-Layered-Control. In addition to the originally supported text-guided functionality, it adds brush-controlled layer separation capabilities.
- **March 2, 2026** Added support for [Anima](https://modelscope.cn/models/circlestone-labs/Anima). For details, please refer to the [documentation](docs/en/Model_Details/Anima.md). This is an interesting anime-style image generation model. We look forward to its future updates.
<details>
<summary>More</summary>
- **February 26, 2026** Added full and LoRA training support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details.
- **February 10, 2026** Added inference support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details. Support for model training will be implemented in the future.
- **February 2, 2026** The first document of the Research Tutorial series is now available, guiding you through training a small 0.1B text-to-image model from scratch. For details, see the [documentation](/docs/en/Research_Tutorial/train_from_scratch.md) and [model](https://modelscope.cn/models/DiffSynth-Studio/AAAMyModel). We hope DiffSynth-Studio can evolve into a more powerful training framework for Diffusion models.
- **January 27, 2026**: [Z-Image](https://modelscope.cn/models/Tongyi-MAI/Z-Image) is released, and our [Z-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Z-Image-i2L) model is released concurrently. You can use it in [ModelScope Studios](https://modelscope.cn/studios/DiffSynth-Studio/Z-Image-i2L). For details, see the [documentation](/docs/en/Model_Details/Z-Image.md).
- **January 19, 2026**: Added support for [FLUX.2-klein-4B](https://modelscope.cn/models/black-forest-labs/FLUX.2-klein-4B) and [FLUX.2-klein-9B](https://modelscope.cn/models/black-forest-labs/FLUX.2-klein-9B) models, including training and inference capabilities. [Documentation](/docs/en/Model_Details/FLUX2.md) and [example code](/examples/flux2/) are now available.
- **January 12, 2026**: We trained and open-sourced a text-guided image layer separation model ([Model Link](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control)). Given an input image and a textual description, the model isolates the image layer corresponding to the described content. For more details, please refer to our blog post ([Chinese version](https://modelscope.cn/learn/4938), [English version](https://huggingface.co/blog/kelseye/qwen-image-layered-control)).
- **December 24, 2025**: Based on Qwen-Image-Edit-2511, we trained an In-Context Editing LoRA model ([Model Link](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-2511-ICEdit-LoRA)). This model takes three images as input (Image A, Image B, and Image C), and automatically analyzes the transformation from Image A to Image B, then applies the same transformation to Image C to generate Image D. For more details, please refer to our blog post ([Chinese version](https://mp.weixin.qq.com/s/41aEiN3lXKGCJs1-we4Q2g), [English version](https://huggingface.co/blog/kelseye/qwen-image-edit-2511-icedit-lora)).
- **December 9, 2025** We release a wild model based on DiffSynth-Studio 2.0: [Qwen-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-i2L) (Image-to-LoRA). This model takes an image as input and outputs a LoRA. Although this version still has significant room for improvement in terms of generalization, detail preservation, and other aspects, we are open-sourcing these models to inspire more innovative research. For more details, please refer to our [blog](https://huggingface.co/blog/kelseye/qwen-image-i2l).
- **December 4, 2025** DiffSynth-Studio 2.0 released! Many new features online
- [Documentation](/docs/en/README.md) online: Our documentation is still continuously being optimized and updated
- [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md) module upgraded, supporting layer-level disk offload, releasing both memory and VRAM simultaneously
- New model support
- Z-Image Turbo: [Model](https://www.modelscope.ai/models/Tongyi-MAI/Z-Image-Turbo), [Documentation](/docs/en/Model_Details/Z-Image.md), [Code](/examples/z_image/)
- FLUX.2-dev: [Model](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-dev), [Documentation](/docs/en/Model_Details/FLUX2.md), [Code](/examples/flux2/)
- Training framework upgrade
- [Split Training](/docs/zh/Training/Split_Training.md): Supports automatically splitting the training process into two stages: data processing and training (even for training ControlNet or any other model). Computations that do not require gradient backpropagation, such as text encoding and VAE encoding, are performed during the data processing stage, while other computations are handled during the training stage. Faster speed, less VRAM requirement.
- [Differential LoRA Training](/docs/zh/Training/Differential_LoRA.md): This is a training technique we used in [ArtAug](https://www.modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1), now available for LoRA training of any model.
- [FP8 Training](/docs/zh/Training/FP8_Precision.md): FP8 can be applied to any non-training model during training, i.e., models with gradients turned off or gradients that only affect LoRA weights.
- **November 4, 2025** Supported the [ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B) model, which is trained based on Wan 2.1 and supports generating corresponding actions based on reference videos.
- **October 30, 2025** Supported the [meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video) model, which supports text-to-video, image-to-video, and video continuation. This model uses the Wan framework for inference and training in this project.
- **October 27, 2025** Supported the [krea/krea-realtime-video](https://www.modelscope.cn/models/krea/krea-realtime-video) model, adding another member to the Wan model ecosystem.
- **September 23, 2025** [DiffSynth-Studio/Qwen-Image-EliGen-Poster](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-Poster) released! This model was jointly developed and open-sourced by us and Taobao Experience Design Team. Built upon Qwen-Image, the model is specifically designed for e-commerce poster scenarios, supporting precise partition layout control. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-EliGen-Poster.py).
- **September 9, 2025** Our training framework supports various training modes. Currently adapted for Qwen-Image, in addition to the standard SFT training mode, Direct Distill is now supported. Please refer to [our sample code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh). This feature is experimental, and we will continue to improve it to support more comprehensive model training functions.
- **August 28, 2025** We support Wan2.2-S2V, an audio-driven cinematic video generation model. See [./examples/wanvideo/](./examples/wanvideo/).
- **August 21, 2025** [DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2) released! Compared to the V1 version, the training dataset has been changed to [Qwen-Image-Self-Generated-Dataset](https://www.modelscope.cn/datasets/DiffSynth-Studio/Qwen-Image-Self-Generated-Dataset), so the generated images better conform to Qwen-Image's own image distribution and style. Please refer to [our sample code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py).
- **August 21, 2025** We open-sourced the [DiffSynth-Studio/Qwen-Image-In-Context-Control-Union](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-In-Context-Control-Union) structural control LoRA model, adopting the In Context technical route, supporting multiple categories of structural control conditions, including canny, depth, lineart, softedge, normal, and openpose. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-In-Context-Control-Union.py).
- **August 20, 2025** We open-sourced the [DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix) model, improving the editing effect of Qwen-Image-Edit on low-resolution image inputs. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py).
- **August 19, 2025** 🔥 Qwen-Image-Edit open-sourced, welcome a new member to the image editing model family!
- **August 18, 2025** We trained and open-sourced the Qwen-Image inpainting ControlNet model [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint). The model structure adopts a lightweight design. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Inpaint.py).
- **August 15, 2025** We open-sourced the [Qwen-Image-Self-Generated-Dataset](https://www.modelscope.cn/datasets/DiffSynth-Studio/Qwen-Image-Self-Generated-Dataset) dataset. This is an image dataset generated using the Qwen-Image model, containing 160,000 `1024 x 1024` images. It includes general, English text rendering, and Chinese text rendering subsets. We provide annotations for image descriptions, entities, and structural control images for each image. Developers can use this dataset to train Qwen-Image models' ControlNet and EliGen models. We aim to promote technological development through open-sourcing!
- **August 13, 2025** We trained and open-sourced the Qwen-Image ControlNet model [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth). The model structure adopts a lightweight design. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py).
- **August 12, 2025** We trained and open-sourced the Qwen-Image ControlNet model [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny). The model structure adopts a lightweight design. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py).
- **August 11, 2025** We open-sourced the distilled acceleration model [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA) for Qwen-Image, following the same training process as [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full), but the model structure has been modified to LoRA, thus being better compatible with other open-source ecosystem models.
- **August 7, 2025** We open-sourced the entity control LoRA model [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen) for Qwen-Image. Qwen-Image-EliGen can achieve entity-level controlled text-to-image generation. Technical details can be found in [the paper](https://arxiv.org/abs/2501.01097). Training dataset: [EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet).
- **August 5, 2025** We open-sourced the distilled acceleration model [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full) for Qwen-Image, achieving approximately 5x acceleration.
- **August 4, 2025** 🔥 Qwen-Image open-sourced, welcome a new member to the image generation model family!
- **August 1, 2025** [FLUX.1-Krea-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Krea-dev) open-sourced, a text-to-image model focused on aesthetic photography. We provided comprehensive support in a timely manner, including low VRAM layer-by-layer offload, LoRA training, and full training. For more details, please refer to [./examples/flux/](./examples/flux/).
- **July 28, 2025** Wan 2.2 open-sourced. We provided comprehensive support in a timely manner, including low VRAM layer-by-layer offload, FP8 quantization, sequence parallelism, LoRA training, and full training. For more details, please refer to [./examples/wanvideo/](./examples/wanvideo/).
- **July 11, 2025** We propose Nexus-Gen, a unified framework that combines the language reasoning capabilities of Large Language Models (LLMs) with the image generation capabilities of diffusion models. This framework supports seamless image understanding, generation, and editing tasks.
- Paper: [Nexus-Gen: Unified Image Understanding, Generation, and Editing via Prefilled Autoregression in Shared Embedding Space](https://arxiv.org/pdf/2504.21356)
- GitHub Repository: https://github.com/modelscope/Nexus-Gen
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Nexus-GenV2), [HuggingFace](https://huggingface.co/modelscope/Nexus-GenV2)
- Training Dataset: [ModelScope Dataset](https://www.modelscope.cn/datasets/DiffSynth-Studio/Nexus-Gen-Training-Dataset)
- Online Experience: [ModelScope Nexus-Gen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/Nexus-Gen)
- **June 15, 2025** ModelScope's official evaluation framework [EvalScope](https://github.com/modelscope/evalscope) now supports text-to-image generation evaluation. Please refer to the [best practices](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/t2i_eval.html) guide to try it out.
- **March 25, 2025** Our new open-source project [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine) is now open-sourced! Focused on stable model deployment, targeting industry, providing better engineering support, higher computational performance, and more stable features.
- **March 31, 2025** We support InfiniteYou, a face feature preservation method for FLUX. More details can be found in [./examples/InfiniteYou/](./examples/InfiniteYou/).
- **March 13, 2025** We support HunyuanVideo-I2V, the image-to-video generation version of Tencent's open-source HunyuanVideo. More details can be found in [./examples/HunyuanVideo/](./examples/HunyuanVideo/).
- **February 25, 2025** We support Wan-Video, a series of state-of-the-art video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/).
- **February 17, 2025** We support [StepVideo](https://modelscope.cn/models/stepfun-ai/stepvideo-t2v/summary)! Advanced video synthesis model! See [./examples/stepvideo](./examples/stepvideo/).
- **December 31, 2024** We propose EliGen, a new framework for entity-level controlled text-to-image generation, supplemented with an inpainting fusion pipeline, extending its capabilities to image inpainting tasks. EliGen can seamlessly integrate existing community models such as IP-Adapter and In-Context LoRA, enhancing their versatility. For more details, see [./examples/EntityControl](./examples/EntityControl/).
- Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen), [HuggingFace](https://huggingface.co/modelscope/EliGen)
- Online Experience: [ModelScope EliGen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/EliGen)
- Training Dataset: [EliGen Train Set](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)
- **December 19, 2024** We implemented advanced VRAM management for HunyuanVideo, enabling video generation with resolutions of 129x720x1280 on 24GB VRAM or 129x512x384 on just 6GB VRAM. More details can be found in [./examples/HunyuanVideo/](./examples/HunyuanVideo/).
- **December 18, 2024** We propose ArtAug, a method to improve text-to-image models through synthesis-understanding interaction. We trained an ArtAug enhancement module for FLUX.1-dev in LoRA format. This model incorporates the aesthetic understanding of Qwen2-VL-72B into FLUX.1-dev, thereby improving the quality of generated images.
- Paper: https://arxiv.org/abs/2412.12888
- Example: https://github.com/modelscope/DiffSynth-Studio/tree/main/examples/ArtAug
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1), [HuggingFace](https://huggingface.co/ECNU-CILab/ArtAug-lora-FLUX.1dev-v1)
- Demo: [ModelScope](https://modelscope.cn/aigc/imageGeneration?tab=advanced&versionId=7228&modelType=LoRA&sdVersion=FLUX_1&modelUrl=modelscope%3A%2F%2FDiffSynth-Studio%2FArtAug-lora-FLUX.1dev-v1%3Frevision%3Dv1.0), HuggingFace (coming soon)
- **October 25, 2024** We provide extensive FLUX ControlNet support. This project supports many different ControlNet models and can be freely combined, even if their structures are different. Additionally, ControlNet models are compatible with high-resolution optimization and partition control technologies, enabling very powerful controllable image generation. See [`./examples/ControlNet/`](./examples/ControlNet/).
- **October 8, 2024** We released extended LoRAs based on CogVideoX-5B and ExVideo. You can download this model from [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1) or [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1).
- **August 22, 2024** This project now supports CogVideoX-5B. See [here](/examples/video_synthesis/). We provide several interesting features for this text-to-video model, including:
- Text-to-video
- Video editing
- Self super-resolution
- Video interpolation
- **August 22, 2024** We implemented an interesting brush feature that supports all text-to-image models. Now you can create stunning images with the assistance of AI using the brush!
- Use it in our [WebUI](#usage-in-webui).
- **August 21, 2024** DiffSynth-Studio now supports FLUX.
- Enable CFG and high-resolution inpainting to improve visual quality. See [here](/examples/image_synthesis/README.md)
- LoRA, ControlNet, and other addon models will be released soon.
- **June 21, 2024** We propose ExVideo, a post-training fine-tuning technique aimed at enhancing the capabilities of video generation models. We extended Stable Video Diffusion to achieve long video generation of up to 128 frames.
- [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
- Source code has been released in this repository. See [`examples/ExVideo`](./examples/ExVideo/).
- Model has been released at [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) and [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1).
- Technical report has been released at [arXiv](https://arxiv.org/abs/2406.14130).
- You can try ExVideo in this [demo](https://huggingface.co/spaces/modelscope/ExVideo-SVD-128f-v1)!
- **June 13, 2024** DiffSynth Studio has migrated to ModelScope. The development team has also transitioned from "me" to "us". Of course, I will still participate in subsequent development and maintenance work.
- **January 29, 2024** We propose Diffutoon, an excellent cartoon coloring solution.
- [Project Page](https://ecnu-cilab.github.io/DiffutoonProjectPage/)
- Source code has been released in this project.
- Technical report (IJCAI 2024) has been released at [arXiv](https://arxiv.org/abs/2401.16224).
- **December 8, 2023** We decided to initiate a new project aimed at unleashing the potential of diffusion models, especially in video synthesis. The development work of this project officially began.
- **November 15, 2023** We propose FastBlend, a powerful video deflickering algorithm.
- sd-webui extension has been released at [GitHub](https://github.com/Artiprocher/sd-webui-fastblend).
- Demonstration videos have been showcased on Bilibili, including three tasks:
- [Video Deflickering](https://www.bilibili.com/video/BV1d94y1W7PE)
- [Video Interpolation](https://www.bilibili.com/video/BV1Lw411m71p)
- [Image-Driven Video Rendering](https://www.bilibili.com/video/BV1RB4y1Z7LF)
- Technical report has been released at [arXiv](https://arxiv.org/abs/2311.09265).
- Unofficial ComfyUI extensions developed by other users have been released at [GitHub](https://github.com/AInseven/ComfyUI-fastblend).
- **October 1, 2023** We released an early version of the project named FastSDXL. This was an initial attempt to build a diffusion engine.
- Source code has been released at [GitHub](https://github.com/Artiprocher/FastSDXL).
- FastSDXL includes a trainable OLSS scheduler to improve efficiency.
- The original repository of OLSS is located [here](https://github.com/alibaba/EasyNLP/tree/master/diffusion/olss_scheduler).
- Technical report (CIKM 2023) has been released at [arXiv](https://arxiv.org/abs/2305.14677).
- Demonstration video has been released at [Bilibili](https://www.bilibili.com/video/BV1w8411y7uj).
- Since OLSS requires additional training, we did not implement it in this project.
- **August 29, 2023** We propose DiffSynth, a video synthesis framework.
- [Project Page](https://ecnu-cilab.github.io/DiffSynth.github.io/).
- Source code has been released at [EasyNLP](https://github.com/alibaba/EasyNLP/tree/master/diffusion/DiffSynth).
- Technical report (ECML PKDD 2024) has been released at [arXiv](https://arxiv.org/abs/2308.03463).
</details>
## Installation
Install from source (recommended):
```
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```
For more installation methods and instructions for non-NVIDIA GPUs, please refer to the [Installation Guide](/docs/en/Pipeline_Usage/Setup.md).
## Basic Framework
DiffSynth-Studio redesigns the inference and training pipelines for mainstream diffusion models (including FLUX, Wan, etc.), enabling efficient memory management and flexible model training.
<details>
<summary>Environment Variable Configuration</summary>
> Before running model inference or training, you can configure settings such as the model download source via [environment variables](/docs/en/Pipeline_Usage/Environment_Variables.md).
>
> By default, this project downloads models from ModelScope. Users outside China can configure it to download models from the ModelScope international site as follows:
>
> ```python
> import os
> os.environ["MODELSCOPE_DOMAIN"] = "www.modelscope.ai"
> ```
>
> To download models from other sources, please modify the environment variable [DIFFSYNTH_DOWNLOAD_SOURCE](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_download_source).
</details>
### Image Synthesis

#### Z-Image: [/docs/en/Model_Details/Z-Image.md](/docs/en/Model_Details/Z-Image.md)
<details>
<summary>Quick Start</summary>
Running the following code will quickly load the [Tongyi-MAI/Z-Image-Turbo](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image-Turbo) model for inference. FP8 quantization significantly degrades image quality, so we do not recommend enabling any quantization for the Z-Image Turbo model. CPU offloading is recommended, and the model can run with as little as 8 GB of GPU memory.
```python
from diffsynth.pipelines.z_image import ZImagePipeline, ModelConfig
import torch
vram_config = {
"offload_dtype": torch.bfloat16,
"offload_device": "cpu",
"onload_dtype": torch.bfloat16,
"onload_device": "cpu",
"preparing_dtype": torch.bfloat16,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = ZImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="transformer/*.safetensors", **vram_config),
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="tokenizer/"),
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
prompt = "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights."
image = pipe(prompt=prompt, seed=42, rand_device="cuda")
image.save("image.jpg")
```
</details>
<details>
<summary>Examples</summary>
Example code for Z-Image is available at: [/examples/z_image/](/examples/z_image/)
|Model ID|Inference|Low VRAM Inference|Full Training|Validation After Full Training|LoRA Training|Validation After LoRA Training|
|-|-|-|-|-|-|-|
|[Tongyi-MAI/Z-Image](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image)|[code](/examples/z_image/model_inference/Z-Image.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image.py)|[code](/examples/z_image/model_training/full/Z-Image.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image.py)|[code](/examples/z_image/model_training/lora/Z-Image.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image.py)|
|[DiffSynth-Studio/Z-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Z-Image-i2L)|[code](/examples/z_image/model_inference/Z-Image-i2L.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-i2L.py)|-|-|-|-|
|[Tongyi-MAI/Z-Image-Turbo](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image-Turbo)|[code](/examples/z_image/model_inference/Z-Image-Turbo.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo.py)|
|[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|
|[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|
|[PAI/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|
</details>
#### FLUX.2: [/docs/en/Model_Details/FLUX2.md](/docs/en/Model_Details/FLUX2.md)
<details>
<summary>Quick Start</summary>
Running the following code will quickly load the [black-forest-labs/FLUX.2-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-dev) model for inference. VRAM management is enabled, and the framework automatically loads model parameters based on available GPU memory. The model can run with as little as 10 GB of VRAM.
```python
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
vram_config = {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": torch.float8_e4m3fn,
"onload_device": "cpu",
"preparing_dtype": torch.float8_e4m3fn,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = Flux2ImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="transformer/*.safetensors", **vram_config),
ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
],
tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="tokenizer/"),
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
prompt = "High resolution. A dreamy underwater portrait of a serene young woman in a flowing blue dress. Her hair floats softly around her face, strands delicately suspended in the water. Clear, shimmering light filters through, casting gentle highlights, while tiny bubbles rise around her. Her expression is calm, her features finely detailed—creating a tranquil, ethereal scene."
image = pipe(prompt, seed=42, rand_device="cuda", num_inference_steps=50)
image.save("image.jpg")
```
</details>
<details>
<summary>Examples</summary>
Example code for FLUX.2 is available at: [/examples/flux2/](/examples/flux2/)
| Model ID | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|-|-|-|-|-|-|-|
|[black-forest-labs/FLUX.2-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-dev)|[code](/examples/flux2/model_inference/FLUX.2-dev.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-dev.py)|-|-|[code](/examples/flux2/model_training/lora/FLUX.2-dev.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-dev.py)|
|[black-forest-labs/FLUX.2-klein-4B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-4B)|[code](/examples/flux2/model_inference/FLUX.2-klein-4B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-4B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-4B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-4B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-4B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-4B.py)|
|[black-forest-labs/FLUX.2-klein-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-9B.py)|
|[black-forest-labs/FLUX.2-klein-base-4B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-4B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-4B.py)|
|[black-forest-labs/FLUX.2-klein-base-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-9B.py)|
</details>
#### Anima: [/docs/en/Model_Details/Anima.md](/docs/en/Model_Details/Anima.md)
<details>
<summary>Quick Start</summary>
Run the following code to quickly load the [circlestone-labs/Anima](https://www.modelscope.cn/models/circlestone-labs/Anima) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 8GB VRAM.
```python
from diffsynth.pipelines.anima_image import AnimaImagePipeline, ModelConfig
import torch
vram_config = {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": "disk",
"onload_device": "disk",
"preparing_dtype": torch.bfloat16,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = AnimaImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/diffusion_models/anima-preview.safetensors", **vram_config),
ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/text_encoders/qwen_3_06b_base.safetensors", **vram_config),
ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/vae/qwen_image_vae.safetensors", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"),
tokenizer_t5xxl_config=ModelConfig(model_id="stabilityai/stable-diffusion-3.5-large", origin_file_pattern="tokenizer_3/"),
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
prompt = "Masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait."
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
image = pipe(prompt, negative_prompt=negative_prompt, seed=0, num_inference_steps=50)
image.save("image.jpg")
```
</details>
<details>
<summary>Examples</summary>
Example code for Anima is located at: [/examples/anima/](/examples/anima/)
| Model ID | Inference | Low VRAM Inference | Full Training | Validation after Full Training | LoRA Training | Validation after LoRA Training |
|-|-|-|-|-|-|-|
|[circlestone-labs/Anima](https://www.modelscope.cn/models/circlestone-labs/Anima)|[code](/examples/anima/model_inference/anima-preview.py)|[code](/examples/anima/model_inference_low_vram/anima-preview.py)|[code](/examples/anima/model_training/full/anima-preview.sh)|[code](/examples/anima/model_training/validate_full/anima-preview.py)|[code](/examples/anima/model_training/lora/anima-preview.sh)|[code](/examples/anima/model_training/validate_lora/anima-preview.py)|
</details>
#### Qwen-Image: [/docs/en/Model_Details/Qwen-Image.md](/docs/en/Model_Details/Qwen-Image.md)
<details>
<summary>Quick Start</summary>
Running the following code will quickly load the [Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image) model for inference. VRAM management is enabled, and the framework automatically adjusts model parameter loading based on available GPU memory. The model can run with as little as 8 GB of VRAM.
```python
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
import torch
vram_config = {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": torch.float8_e4m3fn,
"onload_device": "cpu",
"preparing_dtype": torch.float8_e4m3fn,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = QwenImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", **vram_config),
ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。"
image = pipe(prompt, seed=0, num_inference_steps=40)
image.save("image.jpg")
```
</details>
<details>
<summary>Model Lineage</summary>
```mermaid
graph LR;
Qwen/Qwen-Image-->Qwen/Qwen-Image-Edit;
Qwen/Qwen-Image-Edit-->Qwen/Qwen-Image-Edit-2509;
Qwen/Qwen-Image-->EliGen-Series;
EliGen-Series-->DiffSynth-Studio/Qwen-Image-EliGen;
DiffSynth-Studio/Qwen-Image-EliGen-->DiffSynth-Studio/Qwen-Image-EliGen-V2;
EliGen-Series-->DiffSynth-Studio/Qwen-Image-EliGen-Poster;
Qwen/Qwen-Image-->Distill-Series;
Distill-Series-->DiffSynth-Studio/Qwen-Image-Distill-Full;
Distill-Series-->DiffSynth-Studio/Qwen-Image-Distill-LoRA;
Qwen/Qwen-Image-->ControlNet-Series;
ControlNet-Series-->Blockwise-ControlNet-Series;
Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny;
Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth;
Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint;
ControlNet-Series-->DiffSynth-Studio/Qwen-Image-In-Context-Control-Union;
Qwen/Qwen-Image-->DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix;
```
</details>
<details>
<summary>Examples</summary>
Example code for Qwen-Image is available at: [/examples/qwen_image/](/examples/qwen_image/)
| Model ID | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|-|-|-|-|-|-|-|
|[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](/examples/qwen_image/model_inference/Qwen-Image.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image.py)|
|[Qwen/Qwen-Image-2512](https://www.modelscope.cn/models/Qwen/Qwen-Image-2512)|[code](/examples/qwen_image/model_inference/Qwen-Image-2512.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-2512.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-2512.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-2512.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-2512.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-2512.py)|
|[Qwen/Qwen-Image-Edit](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit.py)|
|[Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py)|
|[Qwen/Qwen-Image-Edit-2511](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2511)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2511.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2511.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2511.py)|
|[FireRedTeam/FireRed-Image-Edit-1.0](https://www.modelscope.cn/models/FireRedTeam/FireRed-Image-Edit-1.0)|[code](/examples/qwen_image/model_inference/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_inference_low_vram/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_training/full/FireRed-Image-Edit-1.0.sh)|[code](/examples/qwen_image/model_training/validate_full/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_training/lora/FireRed-Image-Edit-1.0.sh)|[code](/examples/qwen_image/model_training/validate_lora/FireRed-Image-Edit-1.0.py)|
|[FireRedTeam/FireRed-Image-Edit-1.1](https://www.modelscope.cn/models/FireRedTeam/FireRed-Image-Edit-1.1)|[code](/examples/qwen_image/model_inference/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_inference_low_vram/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_training/full/FireRed-Image-Edit-1.1.sh)|[code](/examples/qwen_image/model_training/validate_full/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_training/lora/FireRed-Image-Edit-1.1.sh)|[code](/examples/qwen_image/model_training/validate_lora/FireRed-Image-Edit-1.1.py)|
|[lightx2v/Qwen-Image-Edit-2511-Lightning](https://modelscope.cn/models/lightx2v/Qwen-Image-Edit-2511-Lightning)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2511-Lightning.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2511-Lightning.py)|-|-|-|-|
|[Qwen/Qwen-Image-Layered](https://www.modelscope.cn/models/Qwen/Qwen-Image-Layered)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Layered.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered.py)|
|[DiffSynth-Studio/Qwen-Image-Layered-Control](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Layered-Control.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered-Control.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered-Control.py)|
|[DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered-Control-V2.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered-Control-V2.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered-Control-V2.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered-Control-V2.py)|
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|[DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-V2.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|[DiffSynth-Studio/Qwen-Image-EliGen-Poster](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-Poster)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-Poster.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-Poster.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen-Poster.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen-Poster.py)|
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](/examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](/examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-LoRA.py)|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py)|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py)|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint)|[code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Inpaint.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Inpaint.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Inpaint.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Inpaint.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Inpaint.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Inpaint.py)|
|[DiffSynth-Studio/Qwen-Image-In-Context-Control-Union](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-In-Context-Control-Union)|[code](/examples/qwen_image/model_inference/Qwen-Image-In-Context-Control-Union.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-In-Context-Control-Union.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-In-Context-Control-Union.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-In-Context-Control-Union.py)|
|[DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-Lowres-Fix.py)|-|-|-|-|
|[DiffSynth-Studio/Qwen-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-i2L)|[code](/examples/qwen_image/model_inference/Qwen-Image-i2L.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-i2L.py)|-|-|-|-|
</details>
#### FLUX.1: [/docs/en/Model_Details/FLUX.md](/docs/en/Model_Details/FLUX.md)
<details>
<summary>Quick Start</summary>
Running the following code will quickly load the [black-forest-labs/FLUX.1-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-dev) model for inference. VRAM management is enabled, and the framework automatically adjusts model parameter loading based on available GPU memory. The model can run with as little as 8 GB of VRAM.
```python
import torch
from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
# VRAM management settings shared by every model file below: weights use
# float8_e4m3fn for the offload/onload (CPU) and preparing (CUDA) stages,
# while computation runs in bfloat16 on CUDA.
fp8 = torch.float8_e4m3fn
vram_config = dict(
    offload_dtype=fp8,
    offload_device="cpu",
    onload_dtype=fp8,
    onload_device="cpu",
    preparing_dtype=fp8,
    preparing_device="cuda",
    computation_dtype=torch.bfloat16,
    computation_device="cuda",
)

# Files of the FLUX.1-dev repository to load, in order.
flux_repo = "black-forest-labs/FLUX.1-dev"
flux_file_patterns = [
    "flux1-dev.safetensors",
    "text_encoder/model.safetensors",
    "text_encoder_2/*.safetensors",
    "ae.safetensors",
]

pipe = FluxImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id=flux_repo, origin_file_pattern=pattern, **vram_config)
        for pattern in flux_file_patterns
    ],
    # Second entry of mem_get_info, converted from bytes to GiB, minus 1.
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 1,
)

# Generate one image with a fixed seed and write it to disk.
pipe(
    prompt="CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her.",
    seed=0,
).save("image.jpg")
```
</details>
<details>
<summary>Model Lineage</summary>
```mermaid
graph LR;
%% Model lineage for the FLUX.1 family. An edge A-->B presumably means that B is derived from or built on A (confirm against the model documentation).
FLUX.1-Series-->black-forest-labs/FLUX.1-dev;
FLUX.1-Series-->black-forest-labs/FLUX.1-Krea-dev;
FLUX.1-Series-->black-forest-labs/FLUX.1-Kontext-dev;
black-forest-labs/FLUX.1-dev-->FLUX.1-dev-ControlNet-Series;
FLUX.1-dev-ControlNet-Series-->alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta;
FLUX.1-dev-ControlNet-Series-->InstantX/FLUX.1-dev-Controlnet-Union-alpha;
FLUX.1-dev-ControlNet-Series-->jasperai/Flux.1-dev-Controlnet-Upscaler;
black-forest-labs/FLUX.1-dev-->InstantX/FLUX.1-dev-IP-Adapter;
black-forest-labs/FLUX.1-dev-->ByteDance/InfiniteYou;
black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/Eligen;
black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev;
black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev;
black-forest-labs/FLUX.1-dev-->ostris/Flex.2-preview;
black-forest-labs/FLUX.1-dev-->stepfun-ai/Step1X-Edit;
Qwen/Qwen2.5-VL-7B-Instruct-->stepfun-ai/Step1X-Edit;
black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/Nexus-GenV2;
Qwen/Qwen2.5-VL-7B-Instruct-->DiffSynth-Studio/Nexus-GenV2;
```
</details>
<details>
<summary>Examples</summary>
Example code for FLUX.1 is available at: [/examples/flux/](/examples/flux/)
| Model ID | Extra Args | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|-|-|-|-|-|-|-|-|
|[black-forest-labs/FLUX.1-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-dev)||[code](/examples/flux/model_inference/FLUX.1-dev.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev.py)|[code](/examples/flux/model_training/full/FLUX.1-dev.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev.py)|
|[black-forest-labs/FLUX.1-Krea-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Krea-dev)||[code](/examples/flux/model_inference/FLUX.1-Krea-dev.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-Krea-dev.py)|[code](/examples/flux/model_training/full/FLUX.1-Krea-dev.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-Krea-dev.py)|[code](/examples/flux/model_training/lora/FLUX.1-Krea-dev.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-Krea-dev.py)|
|[black-forest-labs/FLUX.1-Kontext-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Kontext-dev)|`kontext_images`|[code](/examples/flux/model_inference/FLUX.1-Kontext-dev.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-Kontext-dev.py)|[code](/examples/flux/model_training/full/FLUX.1-Kontext-dev.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-Kontext-dev.py)|[code](/examples/flux/model_training/lora/FLUX.1-Kontext-dev.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-Kontext-dev.py)|
|[alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta](https://www.modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta)|`controlnet_inputs`|[code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Inpainting-Beta.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Inpainting-Beta.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Inpainting-Beta.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Inpainting-Beta.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Inpainting-Beta.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Inpainting-Beta.py)|
|[InstantX/FLUX.1-dev-Controlnet-Union-alpha](https://www.modelscope.cn/models/InstantX/FLUX.1-dev-Controlnet-Union-alpha)|`controlnet_inputs`|[code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Union-alpha.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Union-alpha.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Union-alpha.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Union-alpha.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Union-alpha.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Union-alpha.py)|
|[jasperai/Flux.1-dev-Controlnet-Upscaler](https://www.modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Upscaler)|`controlnet_inputs`|[code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Upscaler.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Upscaler.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Upscaler.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Upscaler.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Upscaler.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Upscaler.py)|
|[InstantX/FLUX.1-dev-IP-Adapter](https://www.modelscope.cn/models/InstantX/FLUX.1-dev-IP-Adapter)|`ipadapter_images`, `ipadapter_scale`|[code](/examples/flux/model_inference/FLUX.1-dev-IP-Adapter.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-IP-Adapter.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-IP-Adapter.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-IP-Adapter.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-IP-Adapter.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-IP-Adapter.py)|
|[ByteDance/InfiniteYou](https://www.modelscope.cn/models/ByteDance/InfiniteYou)|`infinityou_id_image`, `infinityou_guidance`, `controlnet_inputs`|[code](/examples/flux/model_inference/FLUX.1-dev-InfiniteYou.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-InfiniteYou.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-InfiniteYou.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-InfiniteYou.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-InfiniteYou.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-InfiniteYou.py)|
|[DiffSynth-Studio/Eligen](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen)|`eligen_entity_prompts`, `eligen_entity_masks`, `eligen_enable_on_negative`, `eligen_enable_inpaint`|[code](/examples/flux/model_inference/FLUX.1-dev-EliGen.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-EliGen.py)|-|-|[code](/examples/flux/model_training/lora/FLUX.1-dev-EliGen.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-EliGen.py)|
|[DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev](https://www.modelscope.cn/models/DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev)|`lora_encoder_inputs`, `lora_encoder_scale`|[code](/examples/flux/model_inference/FLUX.1-dev-LoRA-Encoder.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-LoRA-Encoder.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-LoRA-Encoder.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-LoRA-Encoder.py)|-|-|
|[DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev](https://modelscope.cn/models/DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev)||[code](/examples/flux/model_inference/FLUX.1-dev-LoRA-Fusion.py)|-|-|-|-|-|
|[stepfun-ai/Step1X-Edit](https://www.modelscope.cn/models/stepfun-ai/Step1X-Edit)|`step1x_reference_image`|[code](/examples/flux/model_inference/Step1X-Edit.py)|[code](/examples/flux/model_inference_low_vram/Step1X-Edit.py)|[code](/examples/flux/model_training/full/Step1X-Edit.sh)|[code](/examples/flux/model_training/validate_full/Step1X-Edit.py)|[code](/examples/flux/model_training/lora/Step1X-Edit.sh)|[code](/examples/flux/model_training/validate_lora/Step1X-Edit.py)|
|[ostris/Flex.2-preview](https://www.modelscope.cn/models/ostris/Flex.2-preview)|`flex_inpaint_image`, `flex_inpaint_mask`, `flex_control_image`, `flex_control_strength`, `flex_control_stop`|[code](/examples/flux/model_inference/FLEX.2-preview.py)|[code](/examples/flux/model_inference_low_vram/FLEX.2-preview.py)|[code](/examples/flux/model_training/full/FLEX.2-preview.sh)|[code](/examples/flux/model_training/validate_full/FLEX.2-preview.py)|[code](/examples/flux/model_training/lora/FLEX.2-preview.sh)|[code](/examples/flux/model_training/validate_lora/FLEX.2-preview.py)|
|[DiffSynth-Studio/Nexus-GenV2](https://www.modelscope.cn/models/DiffSynth-Studio/Nexus-GenV2)|`nexus_gen_reference_image`|[code](/examples/flux/model_inference/Nexus-Gen-Editing.py)|[code](/examples/flux/model_inference_low_vram/Nexus-Gen-Editing.py)|[code](/examples/flux/model_training/full/Nexus-Gen.sh)|[code](/examples/flux/model_training/validate_full/Nexus-Gen.py)|[code](/examples/flux/model_training/lora/Nexus-Gen.sh)|[code](/examples/flux/model_training/validate_lora/Nexus-Gen.py)|
</details>
### Video Synthesis
https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314
#### LTX-2: [/docs/en/Model_Details/LTX-2.md](/docs/en/Model_Details/LTX-2.md)
<details>
<summary>Quick Start</summary>
Running the following code will quickly load the [Lightricks/LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2) model for inference. VRAM management is enabled, and the framework automatically adjusts model parameter loading based on available GPU memory. The model can run with as little as 8 GB of VRAM.
```python
import torch
from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
# VRAM management settings shared by every model file below: weights use
# float8_e5m2 for the offload/onload (CPU) and preparing (CUDA) stages,
# while computation runs in bfloat16 on CUDA.
vram_config = {
    "offload_dtype": torch.float8_e5m2,
    "offload_device": "cpu",
    "onload_dtype": torch.float8_e5m2,
    "onload_device": "cpu",
    "preparing_dtype": torch.float8_e5m2,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}

# Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2
# Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage
#
# For the LTX-2 base models, both the official checkpoint
#   ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
# and the repackaged checkpoints
#   ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")
# are supported.
#
# The official checkpoints have been repackaged in the DiffSynth-Studio/LTX-2-Repackage repo
# to support separate loading of different submodules and to avoid redundant memory usage
# when users only want to use part of the model.

# Use the repackaged model configs from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading.
pipe = LTX2AudioVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="transformer.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_decoder.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_decoder.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vocoder.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_encoder.safetensors", **vram_config),
        ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
    ],
    tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
    stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
    # Second entry of mem_get_info, converted from bytes to GiB, minus 0.5.
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)

# Use the following model configs instead if you want to initialize the model from the official checkpoints in "Lightricks/LTX-2".
# pipe = LTX2AudioVideoPipeline.from_pretrained(
#     torch_dtype=torch.bfloat16,
#     device="cuda",
#     model_configs=[
#         ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
#         ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
#         ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
#     ],
#     tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
#     stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
#     vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
# )

prompt = "A girl is very happy, she is speaking: \"I enjoy working with Diffsynth-Studio, it's a perfect framework.\""
negative_prompt = (
    "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
    "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
    "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
    "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
    "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
    "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
    "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
    "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
    "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
    "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
    "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
)

# Output resolution is 1024 x 1536 with 121 frames.
height, width, num_frames = 512 * 2, 768 * 2, 121
video, audio = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    seed=43,
    height=height,
    width=width,
    num_frames=num_frames,
    tiled=True,
    use_two_stage_pipeline=True,
)

# Write the generated video and audio into a single MP4 file.
write_video_audio_ltx2(
    video=video,
    audio=audio,
    output_path='ltx2_twostage.mp4',
    fps=24,
    audio_sample_rate=24000,
)
```
</details>
<details>
<summary>Examples</summary>
Example code for LTX-2 is available at: [/examples/ltx2/](/examples/ltx2/)
| Model ID | Extra Args | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|-|-|-|-|-|-|-|-|
|[Lightricks/LTX-2.3: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-I2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-I2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-I2AV.py)|
|[Lightricks/LTX-2.3: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-TwoStage.py)|-|-|-|-|
|[Lightricks/LTX-2.3: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2.3-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-I2AV-DistilledPipeline.py)|-|-|-|-|
|[Lightricks/LTX-2.3: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV.py)|
|[Lightricks/LTX-2.3: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage.py)|-|-|-|-|
|[Lightricks/LTX-2.3: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-DistilledPipeline.py)|-|-|-|-|
|[Lightricks/LTX-2.3: A2V](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py)|-|-|-|-|
|[Lightricks/LTX-2.3: Retake](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_video`,`retake_video_regions`,`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py)|-|-|-|-|
|[Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)|
|[Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)|
|[Lightricks/LTX-2: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV.py)|
|[Lightricks/LTX-2-19b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2-T2AV-IC-LoRA-Union-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV-IC-LoRA.py)|
|[Lightricks/LTX-2-19b-IC-LoRA-Detailer](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-IC-LoRA-Detailer)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2-T2AV-IC-LoRA-Detailer.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-IC-LoRA-Detailer.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV-IC-LoRA.py)|
|[Lightricks/LTX-2: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py)|-|-|-|-|
|[Lightricks/LTX-2: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-DistilledPipeline.py)|-|-|-|-|
|[Lightricks/LTX-2: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-OneStage.py)|-|-|-|-|
|[Lightricks/LTX-2: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-TwoStage.py)|-|-|-|-|
|[Lightricks/LTX-2: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-DistilledPipeline.py)|-|-|-|-|
|[Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-In](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-In)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-Camera-Control-Dolly-In.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-Camera-Control-Dolly-In.py)|-|-|-|-|
|[Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Out](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Out)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-Camera-Control-Dolly-Out.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-Camera-Control-Dolly-Out.py)|-|-|-|-|
|[Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Left](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Left)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-Camera-Control-Dolly-Left.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-Camera-Control-Dolly-Left.py)|-|-|-|-|
|[Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Right](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-Right)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-Camera-Control-Dolly-Right.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-Camera-Control-Dolly-Right.py)|-|-|-|-|
|[Lightricks/LTX-2-19b-LoRA-Camera-Control-Jib-Up](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-LoRA-Camera-Control-Jib-Up)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-Camera-Control-Jib-Up.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-Camera-Control-Jib-Up.py)|-|-|-|-|
|[Lightricks/LTX-2-19b-LoRA-Camera-Control-Jib-Down](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-LoRA-Camera-Control-Jib-Down)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-Camera-Control-Jib-Down.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-Camera-Control-Jib-Down.py)|-|-|-|-|
|[Lightricks/LTX-2-19b-LoRA-Camera-Control-Static](https://www.modelscope.cn/models/Lightricks/LTX-2-19b-LoRA-Camera-Control-Static)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-Camera-Control-Static.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-Camera-Control-Static.py)|-|-|-|-|
</details>
#### Wan: [/docs/en/Model_Details/Wan.md](/docs/en/Model_Details/Wan.md)
<details>
<summary>Quick Start</summary>
Running the following code will quickly load the [Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B) model for inference. VRAM management is enabled, and the framework automatically adjusts model parameter loading based on available GPU memory. The model can run with as little as 8 GB of VRAM.
```python
import torch
from diffsynth.utils.data import save_video, VideoData
from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
# VRAM management settings shared by every model file below: the offload stage
# uses "disk", the onload stage keeps bfloat16 weights on the CPU, and the
# preparing and computation stages use bfloat16 on CUDA.
vram_config = dict(
    offload_dtype="disk",
    offload_device="disk",
    onload_dtype=torch.bfloat16,
    onload_device="cpu",
    preparing_dtype=torch.bfloat16,
    preparing_device="cuda",
    computation_dtype=torch.bfloat16,
    computation_device="cuda",
)

# Files of the Wan2.1-T2V-1.3B repository to load, in order.
wan_repo = "Wan-AI/Wan2.1-T2V-1.3B"
wan_file_patterns = (
    "diffusion_pytorch_model*.safetensors",
    "models_t5_umt5-xxl-enc-bf16.pth",
    "Wan2.1_VAE.pth",
)

pipe = WanVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id=wan_repo, origin_file_pattern=pattern, **vram_config)
        for pattern in wan_file_patterns
    ],
    tokenizer_config=ModelConfig(model_id=wan_repo, origin_file_pattern="google/umt5-xxl/"),
    # Second entry of mem_get_info, converted from bytes to GiB, minus 2.
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
)

positive_text = "纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。"
negative_text = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"

# Generate the video with a fixed seed and tiling enabled, then write it to disk.
video = pipe(
    prompt=positive_text,
    negative_prompt=negative_text,
    seed=0,
    tiled=True,
)
save_video(video, "video.mp4", fps=15, quality=5)
```
</details>
<details>
<summary>Model Lineage</summary>
```mermaid
graph LR;
%% Model lineage for the Wan family. An edge A-->B presumably means that B is derived from or built on A (confirm against the model documentation).
Wan-Series-->Wan2.1-Series;
Wan-Series-->Wan2.2-Series;
Wan2.1-Series-->Wan-AI/Wan2.1-T2V-1.3B;
Wan2.1-Series-->Wan-AI/Wan2.1-T2V-14B;
Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.1-I2V-14B-480P;
Wan-AI/Wan2.1-I2V-14B-480P-->Wan-AI/Wan2.1-I2V-14B-720P;
Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.1-FLF2V-14B-720P;
Wan-AI/Wan2.1-T2V-1.3B-->iic/VACE-Wan2.1-1.3B-Preview;
iic/VACE-Wan2.1-1.3B-Preview-->Wan-AI/Wan2.1-VACE-1.3B;
Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.1-VACE-14B;
Wan-AI/Wan2.1-T2V-1.3B-->Wan2.1-Fun-1.3B-Series;
Wan2.1-Fun-1.3B-Series-->PAI/Wan2.1-Fun-1.3B-InP;
Wan2.1-Fun-1.3B-Series-->PAI/Wan2.1-Fun-1.3B-Control;
Wan-AI/Wan2.1-T2V-14B-->Wan2.1-Fun-14B-Series;
Wan2.1-Fun-14B-Series-->PAI/Wan2.1-Fun-14B-InP;
Wan2.1-Fun-14B-Series-->PAI/Wan2.1-Fun-14B-Control;
Wan-AI/Wan2.1-T2V-1.3B-->Wan2.1-Fun-V1.1-1.3B-Series;
Wan2.1-Fun-V1.1-1.3B-Series-->PAI/Wan2.1-Fun-V1.1-1.3B-Control;
Wan2.1-Fun-V1.1-1.3B-Series-->PAI/Wan2.1-Fun-V1.1-1.3B-InP;
Wan2.1-Fun-V1.1-1.3B-Series-->PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera;
Wan-AI/Wan2.1-T2V-14B-->Wan2.1-Fun-V1.1-14B-Series;
Wan2.1-Fun-V1.1-14B-Series-->PAI/Wan2.1-Fun-V1.1-14B-Control;
Wan2.1-Fun-V1.1-14B-Series-->PAI/Wan2.1-Fun-V1.1-14B-InP;
Wan2.1-Fun-V1.1-14B-Series-->PAI/Wan2.1-Fun-V1.1-14B-Control-Camera;
Wan-AI/Wan2.1-T2V-1.3B-->DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1;
Wan-AI/Wan2.1-T2V-14B-->krea/krea-realtime-video;
Wan-AI/Wan2.1-T2V-14B-->meituan-longcat/LongCat-Video;
Wan-AI/Wan2.1-I2V-14B-720P-->ByteDance/Video-As-Prompt-Wan2.1-14B;
Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.2-Animate-14B;
Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.2-S2V-14B;
Wan2.2-Series-->Wan-AI/Wan2.2-T2V-A14B;
Wan2.2-Series-->Wan-AI/Wan2.2-I2V-A14B;
Wan2.2-Series-->Wan-AI/Wan2.2-TI2V-5B;
Wan-AI/Wan2.2-T2V-A14B-->Wan2.2-Fun-Series;
Wan2.2-Fun-Series-->PAI/Wan2.2-VACE-Fun-A14B;
Wan2.2-Fun-Series-->PAI/Wan2.2-Fun-A14B-InP;
Wan2.2-Fun-Series-->PAI/Wan2.2-Fun-A14B-Control;
Wan2.2-Fun-Series-->PAI/Wan2.2-Fun-A14B-Control-Camera;
```
</details>
<details>
<summary>Examples</summary>
Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/)
| Model ID | Extra Args | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
|-|-|-|-|-|-|-|-|
|[Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-T2V-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-T2V-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-T2V-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-T2V-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py)|
|[Wan-AI/Wan2.1-T2V-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-T2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-T2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-T2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-T2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py)|
|[Wan-AI/Wan2.1-I2V-14B-480P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-I2V-14B-480P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-480P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-480P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py)|
|[Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-I2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py)|
|[Wan-AI/Wan2.1-FLF2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-FLF2V-14B-720P)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-FLF2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-FLF2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-FLF2V-14B-720P.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py)|
|[iic/VACE-Wan2.1-1.3B-Preview](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-1.3B-Preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B-Preview.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B-Preview.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py)|
|[Wan-AI/Wan2.1-VACE-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py)|
|[Wan-AI/Wan2.1-VACE-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-VACE-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-VACE-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-VACE-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py)|
|[PAI/Wan2.1-Fun-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py)|
|[PAI/Wan2.1-Fun-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control)|`control_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py)|
|[PAI/Wan2.1-Fun-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py)|
|[PAI/Wan2.1-Fun-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control)|`control_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py)|
|[PAI/Wan2.1-Fun-V1.1-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control)|`control_video`, `reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py)|
|[PAI/Wan2.1-Fun-V1.1-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control)|`control_video`, `reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py)|
|[PAI/Wan2.1-Fun-V1.1-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py)|
|[PAI/Wan2.1-Fun-V1.1-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py)|
|[PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|
|[PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|
|[DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1)|`motion_bucket_id`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.1-1.3b-speedcontrol-v1.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py)|
|[krea/krea-realtime-video](https://www.modelscope.cn/models/krea/krea-realtime-video)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/krea-realtime-video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/krea-realtime-video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/krea-realtime-video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/krea-realtime-video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/krea-realtime-video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py)|
|[meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video)|`longcat_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/LongCat-Video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/LongCat-Video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/LongCat-Video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/LongCat-Video.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/LongCat-Video.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/LongCat-Video.py)|
|[ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B)|`vap_video`, `vap_prompt`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Video-As-Prompt-Wan2.1-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Video-As-Prompt-Wan2.1-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Video-As-Prompt-Wan2.1-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Video-As-Prompt-Wan2.1-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py)|
|[Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-T2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-T2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-T2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py)|
|[Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-I2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-I2V-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py)|
|[Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-TI2V-5B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-TI2V-5B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-TI2V-5B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py)|
|[Wan-AI/Wan2.2-Animate-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-Animate-14B)|`input_image`, `animate_pose_video`, `animate_face_video`, `animate_inpaint_video`, `animate_mask_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Animate-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Animate-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py)|
|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-S2V-14B_multi_clips.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-S2V-14B_multi_clips.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-S2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-S2V-14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py)|
|[PAI/Wan2.2-VACE-Fun-A14B](https://www.modelscope.cn/models/PAI/Wan2.2-VACE-Fun-A14B)|`vace_control_video`, `vace_reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-VACE-Fun-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-VACE-Fun-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-VACE-Fun-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-VACE-Fun-A14B.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py)|
|[PAI/Wan2.2-Fun-A14B-InP](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-InP)|`input_image`, `end_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-InP.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py)|
|[PAI/Wan2.2-Fun-A14B-Control](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control)|`control_video`, `reference_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py)|
|[PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera)|`control_camera_video`, `input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py)|
|[openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference_low_vram/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/full/MOVA-360P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_full/MOVA-360p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/lora/MOVA-360P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_lora/MOVA-360p-I2AV.py)|
|[openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p)|`input_image`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_inference_low_vram/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/full/MOVA-720P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_full/MOVA-720p-I2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/lora/MOVA-720P-I2AV.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/mova/model_training/validate_lora/MOVA-720p-I2AV.py)|
|[Wan-AI/WanToDance-14B (global model)](https://modelscope.cn/models/Wan-AI/WanToDance-14B)|`wantodance_music_path`, `wantodance_reference_image`, `wantodance_fps`, `wantodance_keyframes`, `wantodance_keyframes_mask`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/WanToDance-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/WanToDance-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/WanToDance-14B-global.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/WanToDance-14B-global.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/WanToDance-14B-global.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/WanToDance-14B-global.py)|
|[Wan-AI/WanToDance-14B (local model)](https://modelscope.cn/models/Wan-AI/WanToDance-14B)|`wantodance_music_path`, `wantodance_reference_image`, `wantodance_fps`, `wantodance_keyframes`, `wantodance_keyframes_mask`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference/WanToDance-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_inference_low_vram/WanToDance-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/full/WanToDance-14B-local.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_full/WanToDance-14B-local.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/lora/WanToDance-14B-local.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/wanvideo/model_training/validate_lora/WanToDance-14B-local.py)|
</details>

## Innovative Achievements
DiffSynth-Studio is not only an engineering-focused model framework, but also an incubator for research innovations.
<details>
<summary>Spectral Evolution Search: Efficient Inference-Time Scaling for Reward-Aligned Image Generation</summary>
- Paper: [Spectral Evolution Search: Efficient Inference-Time Scaling for Reward-Aligned Image Generation](https://arxiv.org/abs/2602.03208)
- Sample Code: [/docs/en/Research_Tutorial/inference_time_scaling.md](/docs/en/Research_Tutorial/inference_time_scaling.md)
|FLUX.1-dev|FLUX.1-dev + SES|Qwen-Image|Qwen-Image + SES|
|-|-|-|-|
|||||
</details>
<details>
<summary>VIRAL: Visual In-Context Reasoning via Analogy in Diffusion Transformers</summary>
- Paper: [VIRAL: Visual In-Context Reasoning via Analogy in Diffusion Transformers](https://arxiv.org/abs/2602.03210)
- Sample Code: [/examples/qwen_image/model_inference/Qwen-Image-Edit-2511-ICEdit.py](/examples/qwen_image/model_inference/Qwen-Image-Edit-2511-ICEdit.py)
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-2511-ICEdit-LoRA)
|Example 1|Example 2|Query|Output|
|-|-|-|-|
|||||
</details>
<details>
<summary>AttriCtrl: Attribute Intensity Control for Image Generation Models</summary>
- Paper: [AttriCtrl: Fine-Grained Control of Aesthetic Attribute Intensity in Diffusion Models](https://arxiv.org/abs/2508.02151)
- Sample Code: [/examples/flux/model_inference/FLUX.1-dev-AttriCtrl.py](/examples/flux/model_inference/FLUX.1-dev-AttriCtrl.py)
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/AttriCtrl-FLUX.1-Dev)
|brightness scale = 0.1|brightness scale = 0.3|brightness scale = 0.5|brightness scale = 0.7|brightness scale = 0.9|
|-|-|-|-|-|
||||||
</details>
<details>
<summary>AutoLoRA: Automated LoRA Retrieval and Fusion</summary>
- Paper: [AutoLoRA: Automatic LoRA Retrieval and Fine-Grained Gated Fusion for Text-to-Image Generation](https://arxiv.org/abs/2508.02107)
- Sample Code: [/examples/flux/model_inference/FLUX.1-dev-LoRA-Fusion.py](/examples/flux/model_inference/FLUX.1-dev-LoRA-Fusion.py)
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev)
||[LoRA 1](https://modelscope.cn/models/cancel13/cxsk)|[LoRA 2](https://modelscope.cn/models/wy413928499/xuancai2)|[LoRA 3](https://modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1)|[LoRA 4](https://modelscope.cn/models/hongyanbujian/JPL)|
|-|-|-|-|-|
|[LoRA 1](https://modelscope.cn/models/cancel13/cxsk) |||||
|[LoRA 2](https://modelscope.cn/models/wy413928499/xuancai2) |||||
|[LoRA 3](https://modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1) |||||
|[LoRA 4](https://modelscope.cn/models/hongyanbujian/JPL) |||||
</details>
<details>
<summary>Nexus-Gen: Unified Architecture for Image Understanding, Generation, and Editing</summary>
- Detailed Page: https://github.com/modelscope/Nexus-Gen
- Paper: [Nexus-Gen: Unified Image Understanding, Generation, and Editing via Prefilled Autoregression in Shared Embedding Space](https://arxiv.org/pdf/2504.21356)
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Nexus-GenV2), [HuggingFace](https://huggingface.co/modelscope/Nexus-GenV2)
- Dataset: [ModelScope Dataset](https://www.modelscope.cn/datasets/DiffSynth-Studio/Nexus-Gen-Training-Dataset)
- Online Experience: [ModelScope Nexus-Gen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/Nexus-Gen)

</details>
<details>
<summary>ArtAug: Aesthetic Enhancement for Image Generation Models</summary>
- Detailed Page: [./examples/ArtAug/](./examples/ArtAug/)
- Paper: [ArtAug: Enhancing Text-to-Image Generation through Synthesis-Understanding Interaction](https://arxiv.org/abs/2412.12888)
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1), [HuggingFace](https://huggingface.co/ECNU-CILab/ArtAug-lora-FLUX.1dev-v1)
- Online Experience: [ModelScope AIGC Tab](https://www.modelscope.cn/aigc/imageGeneration?tab=advanced&versionId=7228&modelType=LoRA&sdVersion=FLUX_1&modelUrl=modelscope%3A%2F%2FDiffSynth-Studio%2FArtAug-lora-FLUX.1dev-v1%3Frevision%3Dv1.0)
|FLUX.1-dev|FLUX.1-dev + ArtAug LoRA|
|-|-|
|||
</details>
<details>
<summary>EliGen: Precise Image Partition Control</summary>
- Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
- Sample Code: [/examples/flux/model_inference/FLUX.1-dev-EliGen.py](/examples/flux/model_inference/FLUX.1-dev-EliGen.py)
- Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen), [HuggingFace](https://huggingface.co/modelscope/EliGen)
- Online Experience: [ModelScope EliGen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/EliGen)
- Dataset: [EliGen Train Set](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)
|Entity Control Region|Generated Image|
|-|-|
|||
</details>
<details>
<summary>ExVideo: Extended Training for Video Generation Models</summary>
- Project Page: [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
- Paper: [ExVideo: Extending Video Diffusion Models via Parameter-Efficient Post-Tuning](https://arxiv.org/abs/2406.14130)
- Sample Code: Please refer to the [older version](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3/examples/ExVideo)
- Model: [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1), [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
https://github.com/modelscope/DiffSynth-Studio/assets/35051019/d97f6aa9-8064-4b5b-9d49-ed6001bb9acc
</details>
<details>
<summary>Diffutoon: High-Resolution Anime-Style Video Rendering</summary>
- Project Page: [Project Page](https://ecnu-cilab.github.io/DiffutoonProjectPage/)
- Paper: [Diffutoon: High-Resolution Editable Toon Shading via Diffusion Models](https://arxiv.org/abs/2401.16224)
- Sample Code: Please refer to the [older version](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3/examples/Diffutoon)
https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd
</details>
<details>
<summary>DiffSynth: The Original Version of This Project</summary>
- Project Page: [Project Page](https://ecnu-cilab.github.io/DiffSynth.github.io/)
- Paper: [DiffSynth: Latent In-Iteration Deflickering for Realistic Video Synthesis](https://arxiv.org/abs/2308.03463)
- Sample Code: Please refer to the [older version](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3/examples/diffsynth)
https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea
</details>
================================================
FILE: README_zh.md
================================================
# DiffSynth-Studio
<a href="https://github.com/modelscope/DiffSynth-Studio"><img src=".github/workflows/logo.gif" title="Logo" style="max-width:100%;" width="55" /></a> <a href="https://trendshift.io/repositories/10946" target="_blank"><img src="https://trendshift.io/api/badge/repositories/10946" alt="modelscope%2FDiffSynth-Studio | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
[PyPI](https://pypi.org/project/DiffSynth/)
[License](https://github.com/modelscope/DiffSynth-Studio/blob/master/LICENSE)
[Issues](https://github.com/modelscope/DiffSynth-Studio/issues)
[Pull Requests](https://GitHub.com/modelscope/DiffSynth-Studio/pull/)
[Commits](https://GitHub.com/modelscope/DiffSynth-Studio/commit/)
[Switch to English](./README.md)
## 简介
> DiffSynth-Studio 文档:[中文版](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/)、[English version](https://diffsynth-studio-doc.readthedocs.io/en/latest/)
欢迎来到 Diffusion 模型的魔法世界!DiffSynth-Studio 是由[魔搭社区](https://www.modelscope.cn/)团队开发和维护的开源 Diffusion 模型引擎。我们期望以框架建设孵化技术创新,凝聚开源社区的力量,探索生成式模型技术的边界!
DiffSynth 目前包括两个开源项目:
* [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): 聚焦于激进的技术探索,面向学术界,提供更前沿的模型能力支持。
* [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine): 聚焦于稳定的模型部署,面向工业界,提供更高的计算性能与更稳定的功能。
[DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) 与 [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine) 是魔搭社区 AIGC 专区的核心引擎,欢迎体验我们精心打造的产品化功能:
* 魔搭社区 AIGC 专区 (面向中国用户): https://modelscope.cn/aigc/home
* ModelScope Civision (for global users): https://modelscope.ai/civision/home
我们相信,一个完善的开源代码框架能够降低技术探索的门槛,我们基于这个代码库搞出了不少[有意思的技术](#创新成果)。或许你也有许多天马行空的构想,借助 DiffSynth-Studio,你可以快速实现这些想法。为此,我们为开发者准备了详细的文档,我们希望通过这些文档,帮助开发者理解 Diffusion 模型的原理,更期待与你一同拓展技术的边界。
## 更新历史
> DiffSynth-Studio 经历了大版本更新,部分旧功能已停止维护,如需使用旧版功能,请切换到大版本更新前的[最后一个历史版本](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3)。
> 目前本项目的开发人员有限,大部分工作由 [Artiprocher](https://github.com/Artiprocher) 和 [mi804](https://github.com/mi804) 负责,因此新功能的开发进展会比较缓慢,issue 的回复和解决速度有限,我们对此感到非常抱歉,请各位开发者理解。
- **2026年1月19日** 新增对 [openmoss/MOVA-720p](https://modelscope.cn/models/openmoss/MOVA-720p) 和 [openmoss/MOVA-360p](https://modelscope.cn/models/openmoss/MOVA-360p) 模型的支持,包括完整的训练和推理功能。[文档](/docs/zh/Model_Details/Wan.md)和[示例代码](/examples/mova/)现已可用。
- **2026年3月12日** 我们新增了 [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) 音视频生成模型的支持,模型支持的功能包括文生音视频、图生音视频、IC-LoRA控制、音频生视频、音视频局部Inpainting,框架支持完整的推理和训练功能。详细信息请参考 [文档](/docs/zh/Model_Details/LTX-2.md) 和 [示例代码](/examples/ltx2/)。
- **2026年3月3日** 我们发布了 [DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2) 模型,这是 Qwen-Image-Layered-Control 的更新版本。除了原本就支持的文本引导功能,新增了画笔控制的图层拆分能力。
- **2026年3月2日** 新增对[Anima](https://modelscope.cn/models/circlestone-labs/Anima)的支持,详见[文档](docs/zh/Model_Details/Anima.md)。这是一个有趣的动漫风格图像生成模型,我们期待其后续的模型更新。
<details>
<summary>更多</summary>
- **2026年2月26日** 新增对[LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2)音视频生成模型全量微调与LoRA训练支持,详见[文档](docs/zh/Model_Details/LTX-2.md)。
- **2026年2月10日** 新增对[LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2)音视频生成模型的推理支持,详见[文档](docs/zh/Model_Details/LTX-2.md),后续将推进模型训练的支持。
- **2026年2月2日** Research Tutorial 的第一篇文档上线,带你从零开始训练一个 0.1B 的小型文生图模型,详见[文档](/docs/zh/Research_Tutorial/train_from_scratch.md)、[模型](https://modelscope.cn/models/DiffSynth-Studio/AAAMyModel),我们希望 DiffSynth-Studio 能够成为一个更强大的 Diffusion 模型训练框架。
- **2026年1月27日** [Z-Image](https://modelscope.cn/models/Tongyi-MAI/Z-Image) 发布,我们的 [Z-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Z-Image-i2L) 模型同步发布,在[魔搭创空间](https://modelscope.cn/studios/DiffSynth-Studio/Z-Image-i2L)可直接体验,详见[文档](/docs/zh/Model_Details/Z-Image.md)。
- **2026年1月19日** 新增对 [FLUX.2-klein-4B](https://modelscope.cn/models/black-forest-labs/FLUX.2-klein-4B) 和 [FLUX.2-klein-9B](https://modelscope.cn/models/black-forest-labs/FLUX.2-klein-9B) 模型的支持,包括完整的训练和推理功能。[文档](/docs/zh/Model_Details/FLUX2.md)和[示例代码](/examples/flux2/)现已可用。
- **2026年1月12日** 我们训练并开源了一个文本引导的图层拆分模型([模型链接](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control)),这一模型输入一张图与一段文本描述,模型会将图像中与文本描述相关的图层拆分出来。更多细节请阅读我们的 blog([中文版](https://modelscope.cn/learn/4938)、[英文版](https://huggingface.co/blog/kelseye/qwen-image-layered-control))。
- **2025年12月24日** 我们基于 Qwen-Image-Edit-2511 训练了一个 In-Context Editing LoRA 模型([模型链接](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-2511-ICEdit-LoRA)),这个模型可以输入三张图:图A、图B、图C,模型会自行分析图A到图B的变化,并将这样的变化应用到图C,生成图D。更多细节请阅读我们的 blog([中文版](https://mp.weixin.qq.com/s/41aEiN3lXKGCJs1-we4Q2g)、[英文版](https://huggingface.co/blog/kelseye/qwen-image-edit-2511-icedit-lora))。
- **2025年12月9日** 我们基于 DiffSynth-Studio 2.0 训练了一个疯狂的模型:[Qwen-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-i2L)(Image to LoRA)。这一模型以图像为输入,以 LoRA 为输出。尽管这个版本的模型在泛化能力、细节保持能力等方面还有很大改进空间,我们将这些模型开源,以启发更多创新性的研究工作。更多细节,请参考我们的 [blog](https://huggingface.co/blog/kelseye/qwen-image-i2l)。
- **2025年12月4日** DiffSynth-Studio 2.0 发布!众多新功能上线
- [文档](/docs/zh/README.md)上线:我们的文档还在持续优化更新中
- [显存管理](/docs/zh/Pipeline_Usage/VRAM_management.md)模块升级,支持 Layer 级别的 Disk Offload,同时释放内存与显存
- 新模型支持
- Z-Image Turbo: [模型](https://www.modelscope.ai/models/Tongyi-MAI/Z-Image-Turbo)、[文档](/docs/zh/Model_Details/Z-Image.md)、[代码](/examples/z_image/)
- FLUX.2-dev: [模型](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-dev)、[文档](/docs/zh/Model_Details/FLUX2.md)、[代码](/examples/flux2/)
- 训练框架升级
- [拆分训练](/docs/zh/Training/Split_Training.md):支持自动化地将训练过程拆分为数据处理和训练两阶段(即使训练的是 ControlNet 或其他任意模型),在数据处理阶段进行文本编码、VAE 编码等不需要梯度回传的计算,在训练阶段处理其他计算。速度更快,显存需求更少。
- [差分 LoRA 训练](/docs/zh/Training/Differential_LoRA.md):这是我们曾在 [ArtAug](https://www.modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1) 中使用的训练技术,目前已可用于任意模型的 LoRA 训练。
- [FP8 训练](/docs/zh/Training/FP8_Precision.md):FP8 在训练中支持应用到任意非训练模型,即梯度关闭或者梯度仅影响 LoRA 权重的模型。
- **2025年11月4日** 支持了 [ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B) 模型,该模型基于 Wan 2.1 训练,支持根据参考视频生成相应的动作。
- **2025年10月30日** 支持了 [meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video) 模型,该模型支持文生视频、图生视频、视频续写。这个模型在本项目中沿用 Wan 的框架进行推理和训练。
- **2025年10月27日** 支持了 [krea/krea-realtime-video](https://www.modelscope.cn/models/krea/krea-realtime-video) 模型,Wan 模型生态再添一员。
- **2025年9月23日** [DiffSynth-Studio/Qwen-Image-EliGen-Poster](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-Poster) 发布!本模型由我们与淘天体验设计团队联合研发并开源。模型基于 Qwen-Image 构建,专为电商海报场景设计,支持精确的分区布局控制。 请参考[我们的示例代码](./examples/qwen_image/model_inference/Qwen-Image-EliGen-Poster.py)。
- **2025年9月9日** 我们的训练框架支持了多种训练模式,目前已适配 Qwen-Image,除标准 SFT 训练模式外,已支持 Direct Distill,请参考[我们的示例代码](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh)。这项功能是实验性的,我们将会继续完善,以支持更全面的模型训练功能。
- **2025年8月28日** 我们支持了Wan2.2-S2V,一个音频驱动的电影级视频生成模型。请参见[./examples/wanvideo/](./examples/wanvideo/)。
- **2025年8月21日** [DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2) 发布!相比于 V1 版本,训练数据集变为 [Qwen-Image-Self-Generated-Dataset](https://www.modelscope.cn/datasets/DiffSynth-Studio/Qwen-Image-Self-Generated-Dataset),因此,生成的图像更符合 Qwen-Image 本身的图像分布和风格。 请参考[我们的示例代码](./examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py)。
- **2025年8月21日** 我们开源了 [DiffSynth-Studio/Qwen-Image-In-Context-Control-Union](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-In-Context-Control-Union) 结构控制 LoRA 模型,采用 In Context 的技术路线,支持多种类别的结构控制条件,包括 canny, depth, lineart, softedge, normal, openpose。 请参考[我们的示例代码](./examples/qwen_image/model_inference/Qwen-Image-In-Context-Control-Union.py)。
- **2025年8月20日** 我们开源了 [DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix) 模型,提升了 Qwen-Image-Edit 对低分辨率图像输入的编辑效果。请参考[我们的示例代码](./examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py)。
- **2025年8月19日** 🔥 Qwen-Image-Edit 开源,欢迎图像编辑模型新成员!
- **2025年8月18日** 我们训练并开源了 Qwen-Image 的图像重绘 ControlNet 模型 [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint),模型结构采用了轻量化的设计,请参考[我们的示例代码](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Inpaint.py)。
- **2025年8月15日** 我们开源了 [Qwen-Image-Self-Generated-Dataset](https://www.modelscope.cn/datasets/DiffSynth-Studio/Qwen-Image-Self-Generated-Dataset) 数据集。这是一个使用 Qwen-Image 模型生成的图像数据集,共包含 160,000 张`1024 x 1024`图像。它包括通用、英文文本渲染和中文文本渲染子集。我们为每张图像提供了图像描述、实体和结构控制图像的标注。开发者可以使用这个数据集来训练 Qwen-Image 模型的 ControlNet 和 EliGen 等模型,我们旨在通过开源推动技术发展!
- **2025年8月13日** 我们训练并开源了 Qwen-Image 的 ControlNet 模型 [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth),模型结构采用了轻量化的设计,请参考[我们的示例代码](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)。
- **2025年8月12日** 我们训练并开源了 Qwen-Image 的 ControlNet 模型 [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny),模型结构采用了轻量化的设计,请参考[我们的示例代码](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)。
- **2025年8月11日** 我们开源了 Qwen-Image 的蒸馏加速模型 [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA),沿用了与 [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full) 相同的训练流程,但模型结构修改为了 LoRA,因此能够更好地与其他开源生态模型兼容。
- **2025年8月7日** 我们开源了 Qwen-Image 的实体控制 LoRA 模型 [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)。Qwen-Image-EliGen 能够实现实体级可控的文生图。技术细节请参见[论文](https://arxiv.org/abs/2501.01097)。训练数据集:[EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)。
- **2025年8月5日** 我们开源了 Qwen-Image 的蒸馏加速模型 [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full),实现了约 5 倍加速。
- **2025年8月4日** 🔥 Qwen-Image 开源,欢迎图像生成模型家族新成员!
- **2025年8月1日** [FLUX.1-Krea-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Krea-dev) 开源,这是一个专注于美学摄影的文生图模型。我们第一时间提供了全方位支持,包括低显存逐层 offload、LoRA 训练、全量训练。详细信息请参考 [./examples/flux/](./examples/flux/)。
- **2025年7月28日** Wan 2.2 开源,我们第一时间提供了全方位支持,包括低显存逐层 offload、FP8 量化、序列并行、LoRA 训练、全量训练。详细信息请参考 [./examples/wanvideo/](./examples/wanvideo/)。
- **2025年7月11日** 我们提出 Nexus-Gen,一个将大语言模型(LLM)的语言推理能力与扩散模型的图像生成能力相结合的统一框架。该框架支持无缝的图像理解、生成和编辑任务。
- 论文: [Nexus-Gen: Unified Image Understanding, Generation, and Editing via Prefilled Autoregression in Shared Embedding Space](https://arxiv.org/pdf/2504.21356)
- Github 仓库: https://github.com/modelscope/Nexus-Gen
- 模型: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Nexus-GenV2), [HuggingFace](https://huggingface.co/modelscope/Nexus-GenV2)
- 训练数据集: [ModelScope Dataset](https://www.modelscope.cn/datasets/DiffSynth-Studio/Nexus-Gen-Training-Dataset)
- 在线体验: [ModelScope Nexus-Gen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/Nexus-Gen)
- **2025年6月15日** ModelScope 官方评测框架 [EvalScope](https://github.com/modelscope/evalscope) 现已支持文生图生成评测。请参考[最佳实践](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/t2i_eval.html)指南进行尝试。
- **2025年3月25日** 我们的新开源项目 [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine) 现已开源!专注于稳定的模型部署,面向工业界,提供更好的工程支持、更高的计算性能和更稳定的功能。
- **2025年3月31日** 我们支持 InfiniteYou,一种用于 FLUX 的人脸特征保留方法。更多细节请参考 [./examples/InfiniteYou/](./examples/InfiniteYou/)。
- **2025年3月13日** 我们支持 HunyuanVideo-I2V,即腾讯开源的 HunyuanVideo 的图像到视频生成版本。更多细节请参考 [./examples/HunyuanVideo/](./examples/HunyuanVideo/)。
- **2025年2月25日** 我们支持 Wan-Video,这是阿里巴巴开源的一系列最先进的视频合成模型。详见 [./examples/wanvideo/](./examples/wanvideo/)。
- **2025年2月17日** 我们支持 [StepVideo](https://modelscope.cn/models/stepfun-ai/stepvideo-t2v/summary)!先进的视频合成模型!详见 [./examples/stepvideo](./examples/stepvideo/)。
- **2024年12月31日** 我们提出 EliGen,一种用于精确实体级别控制的文本到图像生成的新框架,并辅以修复融合管道,将其能力扩展到图像修复任务。EliGen 可以无缝集成现有的社区模型,如 IP-Adapter 和 In-Context LoRA,提升其通用性。更多详情,请见 [./examples/EntityControl](./examples/EntityControl/)。
- 论文: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
- 模型: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen), [HuggingFace](https://huggingface.co/modelscope/EliGen)
- 在线体验: [ModelScope EliGen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/EliGen)
- 训练数据集: [EliGen Train Set](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)
- **2024年12月19日** 我们为 HunyuanVideo 实现了高级显存管理,使得在 24GB 显存下可以生成分辨率为 129x720x1280 的视频,或在仅 6GB 显存下生成分辨率为 129x512x384 的视频。更多细节请参考 [./examples/HunyuanVideo/](./examples/HunyuanVideo/)。
- **2024年12月18日** 我们提出 ArtAug,一种通过合成-理解交互来改进文生图模型的方法。我们以 LoRA 格式为 FLUX.1-dev 训练了一个 ArtAug 增强模块。该模型将 Qwen2-VL-72B 的美学理解融入 FLUX.1-dev,从而提升了生成图像的质量。
- 论文: https://arxiv.org/abs/2412.12888
- 示例: https://github.com/modelscope/DiffSynth-Studio/tree/main/examples/ArtAug
- 模型: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1), [HuggingFace](https://huggingface.co/ECNU-CILab/ArtAug-lora-FLUX.1dev-v1)
- 演示: [ModelScope](https://modelscope.cn/aigc/imageGeneration?tab=advanced&versionId=7228&modelType=LoRA&sdVersion=FLUX_1&modelUrl=modelscope%3A%2F%2FDiffSynth-Studio%2FArtAug-lora-FLUX.1dev-v1%3Frevision%3Dv1.0), HuggingFace (即将上线)
- **2024年10月25日** 我们提供了广泛的 FLUX ControlNet 支持。该项目支持许多不同的 ControlNet 模型,并且可以自由组合,即使它们的结构不同。此外,ControlNet 模型兼容高分辨率优化和分区控制技术,能够实现非常强大的可控图像生成。详见 [`./examples/ControlNet/`](./examples/ControlNet/)。
- **2024年10月8日** 我们发布了基于 CogVideoX-5B 和 ExVideo 的扩展 LoRA。您可以从 [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1) 或 [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1) 下载此模型。
- **2024年8月22日** 本项目现已支持 CogVideoX-5B。详见 [此处](/examples/video_synthesis/)。我们为这个文生视频模型提供了几个有趣的功能,包括:
- 文本到视频
- 视频编辑
- 自我超分
- 视频插帧
- **2024年8月22日** 我们实现了一个有趣的画笔功能,支持所有文生图模型。现在,您可以在 AI 的辅助下使用画笔创作惊艳的图像了!
- 在我们的 [WebUI](#usage-in-webui) 中使用它。
- **2024年8月21日** DiffSynth-Studio 现已支持 FLUX。
- 启用 CFG 和高分辨率修复以提升视觉质量。详见 [此处](/examples/image_synthesis/README.md)
- LoRA、ControlNet 和其他附加模型将很快推出。
- **2024年6月21日** 我们提出 ExVideo,一种旨在增强视频生成模型能力的后训练微调技术。我们将 Stable Video Diffusion 进行了扩展,实现了长达 128 帧的长视频生成。
- [项目页面](https://ecnu-cilab.github.io/ExVideoProjectPage/)
- 源代码已在此仓库中发布。详见 [`examples/ExVideo`](./examples/ExVideo/)。
- 模型已发布于 [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) 和 [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1)。
- 技术报告已发布于 [arXiv](https://arxiv.org/abs/2406.14130)。
- 您可以在此 [演示](https://huggingface.co/spaces/modelscope/ExVideo-SVD-128f-v1) 中试用 ExVideo!
- **2024年6月13日** DiffSynth Studio 已迁移至 ModelScope。开发团队也从“我”转变为“我们”。当然,我仍会参与后续的开发和维护工作。
- **2024年1月29日** 我们提出 Diffutoon,这是一个出色的卡通着色解决方案。
- [项目页面](https://ecnu-cilab.github.io/DiffutoonProjectPage/)
- 源代码已在此项目中发布。
- 技术报告(IJCAI 2024)已发布于 [arXiv](https://arxiv.org/abs/2401.16224)。
- **2023年12月8日** 我们决定启动一个新项目,旨在释放扩散模型的潜力,尤其是在视频合成方面。该项目的开发工作正式开始。
- **2023年11月15日** 我们提出 FastBlend,一种强大的视频去闪烁算法。
- sd-webui 扩展已发布于 [GitHub](https://github.com/Artiprocher/sd-webui-fastblend)。
- 演示视频已在 Bilibili 上展示,包含三个任务:
- [视频去闪烁](https://www.bilibili.com/video/BV1d94y1W7PE)
- [视频插帧](https://www.bilibili.com/video/BV1Lw411m71p)
- [图像驱动的视频渲染](https://www.bilibili.com/video/BV1RB4y1Z7LF)
- 技术报告已发布于 [arXiv](https://arxiv.org/abs/2311.09265)。
- 其他用户开发的非官方 ComfyUI 扩展已发布于 [GitHub](https://github.com/AInseven/ComfyUI-fastblend)。
- **2023年10月1日** 我们发布了该项目的早期版本,名为 FastSDXL。这是构建一个扩散引擎的初步尝试。
- 源代码已发布于 [GitHub](https://github.com/Artiprocher/FastSDXL)。
- FastSDXL 包含一个可训练的 OLSS 调度器,以提高效率。
- OLSS 的原始仓库位于 [此处](https://github.com/alibaba/EasyNLP/tree/master/diffusion/olss_scheduler)。
- 技术报告(CIKM 2023)已发布于 [arXiv](https://arxiv.org/abs/2305.14677)。
- 演示视频已发布于 [Bilibili](https://www.bilibili.com/video/BV1w8411y7uj)。
- 由于 OLSS 需要额外训练,我们未在本项目中实现它。
- **2023年8月29日** 我们提出 DiffSynth,一个视频合成框架。
- [项目页面](https://ecnu-cilab.github.io/DiffSynth.github.io/)。
- 源代码已发布在 [EasyNLP](https://github.com/alibaba/EasyNLP/tree/master/diffusion/DiffSynth)。
- 技术报告(ECML PKDD 2024)已发布于 [arXiv](https://arxiv.org/abs/2308.03463)。
</details>
## 安装
从源码安装(推荐):
```
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```
更多安装方式,以及非 NVIDIA GPU 的安装,请参考[安装文档](/docs/zh/Pipeline_Usage/Setup.md)。
## 基础框架
DiffSynth-Studio 为主流 Diffusion 模型(包括 FLUX、Wan 等)重新设计了推理和训练流水线,能够实现高效的显存管理、灵活的模型训练。
<details>
<summary>环境变量配置</summary>
> 在进行模型推理和训练前,可通过[环境变量](/docs/zh/Pipeline_Usage/Environment_Variables.md)配置模型下载源等。
>
> 本项目默认从魔搭社区下载模型。对于非中国区域的用户,可以通过以下配置从魔搭社区的国际站下载模型:
>
> ```python
> import os
> os.environ["MODELSCOPE_DOMAIN"] = "www.modelscope.ai"
> ```
>
> 如需从其他站点下载,请修改[环境变量 DIFFSYNTH_DOWNLOAD_SOURCE](/docs/zh/Pipeline_Usage/Environment_Variables.md#diffsynth_download_source)。
</details>
### 图像生成模型

#### Z-Image: [/docs/zh/Model_Details/Z-Image.md](/docs/zh/Model_Details/Z-Image.md)
<details>
<summary>快速开始</summary>
运行以下代码可以快速加载 [Tongyi-MAI/Z-Image-Turbo](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image-Turbo) 模型并进行推理。FP8 精度量化会导致明显的图像质量劣化,因此不建议在 Z-Image Turbo 模型上开启任何量化,仅建议开启 CPU Offload,最低 8G 显存即可运行。
```python
from diffsynth.pipelines.z_image import ZImagePipeline, ModelConfig
import torch
vram_config = {
"offload_dtype": torch.bfloat16,
"offload_device": "cpu",
"onload_dtype": torch.bfloat16,
"onload_device": "cpu",
"preparing_dtype": torch.bfloat16,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = ZImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="transformer/*.safetensors", **vram_config),
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="tokenizer/"),
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
prompt = "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights."
image = pipe(prompt=prompt, seed=42, rand_device="cuda")
image.save("image.jpg")
```
</details>
<details>
<summary>示例代码</summary>
Z-Image 的示例代码位于:[/examples/z_image/](/examples/z_image/)
|模型 ID|推理|低显存推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
|-|-|-|-|-|-|-|
|[Tongyi-MAI/Z-Image](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image)|[code](/examples/z_image/model_inference/Z-Image.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image.py)|[code](/examples/z_image/model_training/full/Z-Image.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image.py)|[code](/examples/z_image/model_training/lora/Z-Image.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image.py)|
|[DiffSynth-Studio/Z-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Z-Image-i2L)|[code](/examples/z_image/model_inference/Z-Image-i2L.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-i2L.py)|-|-|-|-|
|[Tongyi-MAI/Z-Image-Turbo](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image-Turbo)|[code](/examples/z_image/model_inference/Z-Image-Turbo.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo.py)|
|[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1.py)|
|[PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py)|
|[PAI/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps](https://www.modelscope.cn/models/PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1)|[code](/examples/z_image/model_inference/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_full/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|[code](/examples/z_image/model_training/lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh)|[code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py)|
</details>
#### FLUX.2: [/docs/zh/Model_Details/FLUX2.md](/docs/zh/Model_Details/FLUX2.md)
<details>
<summary>快速开始</summary>
运行以下代码可以快速加载 [black-forest-labs/FLUX.2-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-dev) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 10G 显存即可运行。
```python
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
vram_config = {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": torch.float8_e4m3fn,
"onload_device": "cpu",
"preparing_dtype": torch.float8_e4m3fn,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = Flux2ImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="transformer/*.safetensors", **vram_config),
ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
],
tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="tokenizer/"),
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
prompt = "High resolution. A dreamy underwater portrait of a serene young woman in a flowing blue dress. Her hair floats softly around her face, strands delicately suspended in the water. Clear, shimmering light filters through, casting gentle highlights, while tiny bubbles rise around her. Her expression is calm, her features finely detailed—creating a tranquil, ethereal scene."
image = pipe(prompt, seed=42, rand_device="cuda", num_inference_steps=50)
image.save("image.jpg")
```
</details>
<details>
<summary>示例代码</summary>
FLUX.2 的示例代码位于:[/examples/flux2/](/examples/flux2/)
|模型 ID|推理|低显存推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
|-|-|-|-|-|-|-|
|[black-forest-labs/FLUX.2-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-dev)|[code](/examples/flux2/model_inference/FLUX.2-dev.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-dev.py)|-|-|[code](/examples/flux2/model_training/lora/FLUX.2-dev.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-dev.py)|
|[black-forest-labs/FLUX.2-klein-4B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-4B)|[code](/examples/flux2/model_inference/FLUX.2-klein-4B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-4B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-4B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-4B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-4B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-4B.py)|
|[black-forest-labs/FLUX.2-klein-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-9B.py)|
|[black-forest-labs/FLUX.2-klein-base-4B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-4B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-4B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-4B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-4B.py)|
|[black-forest-labs/FLUX.2-klein-base-9B](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-klein-base-9B)|[code](/examples/flux2/model_inference/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_inference_low_vram/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/full/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_full/FLUX.2-klein-base-9B.py)|[code](/examples/flux2/model_training/lora/FLUX.2-klein-base-9B.sh)|[code](/examples/flux2/model_training/validate_lora/FLUX.2-klein-base-9B.py)|
</details>
#### Anima: [/docs/zh/Model_Details/Anima.md](/docs/zh/Model_Details/Anima.md)
<details>
<summary>快速开始</summary>
运行以下代码可以快速加载 [circlestone-labs/Anima](https://www.modelscope.cn/models/circlestone-labs/Anima) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 8G 显存即可运行。
```python
from diffsynth.pipelines.anima_image import AnimaImagePipeline, ModelConfig
import torch
vram_config = {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": "disk",
"onload_device": "disk",
"preparing_dtype": torch.bfloat16,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = AnimaImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/diffusion_models/anima-preview.safetensors", **vram_config),
ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/text_encoders/qwen_3_06b_base.safetensors", **vram_config),
ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/vae/qwen_image_vae.safetensors", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"),
tokenizer_t5xxl_config=ModelConfig(model_id="stabilityai/stable-diffusion-3.5-large", origin_file_pattern="tokenizer_3/"),
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
prompt = "Masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait."
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
image = pipe(prompt, seed=0, num_inference_steps=50)
image.save("image.jpg")
```
</details>
<details>
<summary>示例代码</summary>
Anima 的示例代码位于:[/examples/anima/](/examples/anima/)
|模型 ID|推理|低显存推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
|-|-|-|-|-|-|-|
|[circlestone-labs/Anima](https://www.modelscope.cn/models/circlestone-labs/Anima)|[code](/examples/anima/model_inference/anima-preview.py)|[code](/examples/anima/model_inference_low_vram/anima-preview.py)|[code](/examples/anima/model_training/full/anima-preview.sh)|[code](/examples/anima/model_training/validate_full/anima-preview.py)|[code](/examples/anima/model_training/lora/anima-preview.sh)|[code](/examples/anima/model_training/validate_lora/anima-preview.py)|
</details>
#### Qwen-Image: [/docs/zh/Model_Details/Qwen-Image.md](/docs/zh/Model_Details/Qwen-Image.md)
<details>
<summary>快速开始</summary>
运行以下代码可以快速加载 [Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 8G 显存即可运行。
```python
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
import torch
vram_config = {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": torch.float8_e4m3fn,
"onload_device": "cpu",
"preparing_dtype": torch.float8_e4m3fn,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = QwenImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", **vram_config),
ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。"
image = pipe(prompt, seed=0, num_inference_steps=40)
image.save("image.jpg")
```
</details>
<details>
<summary>模型血缘</summary>
```mermaid
graph LR;
Qwen/Qwen-Image-->Qwen/Qwen-Image-Edit;
Qwen/Qwen-Image-Edit-->Qwen/Qwen-Image-Edit-2509;
Qwen/Qwen-Image-->EliGen-Series;
EliGen-Series-->DiffSynth-Studio/Qwen-Image-EliGen;
DiffSynth-Studio/Qwen-Image-EliGen-->DiffSynth-Studio/Qwen-Image-EliGen-V2;
EliGen-Series-->DiffSynth-Studio/Qwen-Image-EliGen-Poster;
Qwen/Qwen-Image-->Distill-Series;
Distill-Series-->DiffSynth-Studio/Qwen-Image-Distill-Full;
Distill-Series-->DiffSynth-Studio/Qwen-Image-Distill-LoRA;
Qwen/Qwen-Image-->ControlNet-Series;
ControlNet-Series-->Blockwise-ControlNet-Series;
Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny;
Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth;
Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint;
ControlNet-Series-->DiffSynth-Studio/Qwen-Image-In-Context-Control-Union;
Qwen/Qwen-Image-->DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix;
```
</details>
<details>
<summary>示例代码</summary>
Qwen-Image 的示例代码位于:[/examples/qwen_image/](/examples/qwen_image/)
|模型 ID|推理|低显存推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
|-|-|-|-|-|-|-|
|[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](/examples/qwen_image/model_inference/Qwen-Image.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image.py)|
|[Qwen/Qwen-Image-2512](https://www.modelscope.cn/models/Qwen/Qwen-Image-2512)|[code](/examples/qwen_image/model_inference/Qwen-Image-2512.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-2512.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-2512.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-2512.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-2512.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-2512.py)|
|[Qwen/Qwen-Image-Edit](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit.py)|
|[Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py)|
|[Qwen/Qwen-Image-Edit-2511](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2511)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2511.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2511.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2511.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2511.py)|
|[FireRedTeam/FireRed-Image-Edit-1.0](https://www.modelscope.cn/models/FireRedTeam/FireRed-Image-Edit-1.0)|[code](/examples/qwen_image/model_inference/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_inference_low_vram/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_training/full/FireRed-Image-Edit-1.0.sh)|[code](/examples/qwen_image/model_training/validate_full/FireRed-Image-Edit-1.0.py)|[code](/examples/qwen_image/model_training/lora/FireRed-Image-Edit-1.0.sh)|[code](/examples/qwen_image/model_training/validate_lora/FireRed-Image-Edit-1.0.py)|
|[FireRedTeam/FireRed-Image-Edit-1.1](https://www.modelscope.cn/models/FireRedTeam/FireRed-Image-Edit-1.1)|[code](/examples/qwen_image/model_inference/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_inference_low_vram/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_training/full/FireRed-Image-Edit-1.1.sh)|[code](/examples/qwen_image/model_training/validate_full/FireRed-Image-Edit-1.1.py)|[code](/examples/qwen_image/model_training/lora/FireRed-Image-Edit-1.1.sh)|[code](/examples/qwen_image/model_training/validate_lora/FireRed-Image-Edit-1.1.py)|
|[lightx2v/Qwen-Image-Edit-2511-Lightning](https://modelscope.cn/models/lightx2v/Qwen-Image-Edit-2511-Lightning)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2511-Lightning.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2511-Lightning.py)|-|-|-|-|
|[Qwen/Qwen-Image-Layered](https://www.modelscope.cn/models/Qwen/Qwen-Image-Layered)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Layered.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Layered.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered.py)|
|[DiffSynth-Studio/Qwen-Image-Layered-Control](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Layered-Control.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Layered-Control.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered-Control.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered-Control.py)|
|[DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2)|[code](/examples/qwen_image/model_inference/Qwen-Image-Layered-Control-V2.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Layered-Control-V2.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Layered-Control-V2.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Layered-Control-V2.py)|
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|[DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-V2.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|[DiffSynth-Studio/Qwen-Image-EliGen-Poster](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-Poster)|[code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-Poster.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-Poster.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen-Poster.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen-Poster.py)|
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](/examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](/examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-LoRA.py)|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py)|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py)|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint)|[code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Inpaint.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Inpaint.py)|[code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Inpaint.sh)|[code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Inpaint.py)|[code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Inpaint.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Inpaint.py)|
|[DiffSynth-Studio/Qwen-Image-In-Context-Control-Union](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-In-Context-Control-Union)|[code](/examples/qwen_image/model_inference/Qwen-Image-In-Context-Control-Union.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-In-Context-Control-Union.py)|-|-|[code](/examples/qwen_image/model_training/lora/Qwen-Image-In-Context-Control-Union.sh)|[code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-In-Context-Control-Union.py)|
|[DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix)|[code](/examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-Lowres-Fix.py)|-|-|-|-|
|[DiffSynth-Studio/Qwen-Image-i2L](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-i2L)|[code](/examples/qwen_image/model_inference/Qwen-Image-i2L.py)|[code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-i2L.py)|-|-|-|-|
</details>
#### FLUX.1: [/docs/zh/Model_Details/FLUX.md](/docs/zh/Model_Details/FLUX.md)
<details>
<summary>快速开始</summary>
运行以下代码可以快速加载 [black-forest-labs/FLUX.1-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-dev) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 8GB 显存即可运行。
```python
import torch
from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
vram_config = {
"offload_dtype": torch.float8_e4m3fn,
"offload_device": "cpu",
"onload_dtype": torch.float8_e4m3fn,
"onload_device": "cpu",
"preparing_dtype": torch.float8_e4m3fn,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = FluxImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", **vram_config),
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", **vram_config),
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors", **vram_config),
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", **vram_config),
],
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 1,
)
prompt = "CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her."
image = pipe(prompt=prompt, seed=0)
image.save("image.jpg")
```
</details>
<details>
<summary>模型血缘</summary>
```mermaid
graph LR;
FLUX.1-Series-->black-forest-labs/FLUX.1-dev;
FLUX.1-Series-->black-forest-labs/FLUX.1-Krea-dev;
FLUX.1-Series-->black-forest-labs/FLUX.1-Kontext-dev;
black-forest-labs/FLUX.1-dev-->FLUX.1-dev-ControlNet-Series;
FLUX.1-dev-ControlNet-Series-->alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta;
FLUX.1-dev-ControlNet-Series-->InstantX/FLUX.1-dev-Controlnet-Union-alpha;
FLUX.1-dev-ControlNet-Series-->jasperai/Flux.1-dev-Controlnet-Upscaler;
black-forest-labs/FLUX.1-dev-->InstantX/FLUX.1-dev-IP-Adapter;
black-forest-labs/FLUX.1-dev-->ByteDance/InfiniteYou;
black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/Eligen;
black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev;
black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev;
black-forest-labs/FLUX.1-dev-->ostris/Flex.2-preview;
black-forest-labs/FLUX.1-dev-->stepfun-ai/Step1X-Edit;
Qwen/Qwen2.5-VL-7B-Instruct-->stepfun-ai/Step1X-Edit;
black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/Nexus-GenV2;
Qwen/Qwen2.5-VL-7B-Instruct-->DiffSynth-Studio/Nexus-GenV2;
```
</details>
<details>
<summary>示例代码</summary>
FLUX.1 的示例代码位于:[/examples/flux/](/examples/flux/)
|模型 ID|额外参数|推理|低显存推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
|-|-|-|-|-|-|-|-|
|[black-forest-labs/FLUX.1-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-dev)||[code](/examples/flux/model_inference/FLUX.1-dev.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev.py)|[code](/examples/flux/model_training/full/FLUX.1-dev.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev.py)|
|[black-forest-labs/FLUX.1-Krea-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Krea-dev)||[code](/examples/flux/model_inference/FLUX.1-Krea-dev.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-Krea-dev.py)|[code](/examples/flux/model_training/full/FLUX.1-Krea-dev.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-Krea-dev.py)|[code](/examples/flux/model_training/lora/FLUX.1-Krea-dev.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-Krea-dev.py)|
|[black-forest-labs/FLUX.1-Kontext-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Kontext-dev)|`kontext_images`|[code](/examples/flux/model_inference/FLUX.1-Kontext-dev.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-Kontext-dev.py)|[code](/examples/flux/model_training/full/FLUX.1-Kontext-dev.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-Kontext-dev.py)|[code](/examples/flux/model_training/lora/FLUX.1-Kontext-dev.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-Kontext-dev.py)|
|[alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta](https://www.modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta)|`controlnet_inputs`|[code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Inpainting-Beta.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Inpainting-Beta.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Inpainting-Beta.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Inpainting-Beta.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Inpainting-Beta.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Inpainting-Beta.py)|
|[InstantX/FLUX.1-dev-Controlnet-Union-alpha](https://www.modelscope.cn/models/InstantX/FLUX.1-dev-Controlnet-Union-alpha)|`controlnet_inputs`|[code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Union-alpha.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Union-alpha.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Union-alpha.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Union-alpha.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Union-alpha.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Union-alpha.py)|
|[jasperai/Flux.1-dev-Controlnet-Upscaler](https://www.modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Upscaler)|`controlnet_inputs`|[code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Upscaler.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Upscaler.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Upscaler.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Upscaler.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Upscaler.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Upscaler.py)|
|[InstantX/FLUX.1-dev-IP-Adapter](https://www.modelscope.cn/models/InstantX/FLUX.1-dev-IP-Adapter)|`ipadapter_images`, `ipadapter_scale`|[code](/examples/flux/model_inference/FLUX.1-dev-IP-Adapter.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-IP-Adapter.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-IP-Adapter.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-IP-Adapter.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-IP-Adapter.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-IP-Adapter.py)|
|[ByteDance/InfiniteYou](https://www.modelscope.cn/models/ByteDance/InfiniteYou)|`infinityou_id_image`, `infinityou_guidance`, `controlnet_inputs`|[code](/examples/flux/model_inference/FLUX.1-dev-InfiniteYou.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-InfiniteYou.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-InfiniteYou.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-InfiniteYou.py)|[code](/examples/flux/model_training/lora/FLUX.1-dev-InfiniteYou.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-InfiniteYou.py)|
|[DiffSynth-Studio/Eligen](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen)|`eligen_entity_prompts`, `eligen_entity_masks`, `eligen_enable_on_negative`, `eligen_enable_inpaint`|[code](/examples/flux/model_inference/FLUX.1-dev-EliGen.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-EliGen.py)|-|-|[code](/examples/flux/model_training/lora/FLUX.1-dev-EliGen.sh)|[code](/examples/flux/model_training/validate_lora/FLUX.1-dev-EliGen.py)|
|[DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev](https://www.modelscope.cn/models/DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev)|`lora_encoder_inputs`, `lora_encoder_scale`|[code](/examples/flux/model_inference/FLUX.1-dev-LoRA-Encoder.py)|[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-LoRA-Encoder.py)|[code](/examples/flux/model_training/full/FLUX.1-dev-LoRA-Encoder.sh)|[code](/examples/flux/model_training/validate_full/FLUX.1-dev-LoRA-Encoder.py)|-|-|
|[DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev](https://modelscope.cn/models/DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev)||[code](/examples/flux/model_inference/FLUX.1-dev-LoRA-Fusion.py)|-|-|-|-|-|
|[stepfun-ai/Step1X-Edit](https://www.modelscope.cn/models/stepfun-ai/Step1X-Edit)|`step1x_reference_image`|[code](/examples/flux/model_inference/Step1X-Edit.py)|[code](/examples/flux/model_inference_low_vram/Step1X-Edit.py)|[code](/examples/flux/model_training/full/Step1X-Edit.sh)|[code](/examples/flux/model_training/validate_full/Step1X-Edit.py)|[code](/examples/flux/model_training/lora/Step1X-Edit.sh)|[code](/examples/flux/model_training/validate_lora/Step1X-Edit.py)|
|[ostris/Flex.2-preview](https://www.modelscope.cn/models/ostris/Flex.2-preview)|`flex_inpaint_image`, `flex_inpaint_mask`, `flex_control_image`, `flex_control_strength`, `flex_control_stop`|[code](/examples/flux/model_inference/FLEX.2-preview.py)|[code](/examples/flux/model_inference_low_vram/FLEX.2-preview.py)|[code](/examples/flux/model_training/full/FLEX.2-preview.sh)|[code](/examples/flux/model_training/validate_full/FLEX.2-preview.py)|[code](/examples/flux/model_training/lora/FLEX.2-preview.sh)|[code](/examples/flux/model_training/validate_lora/FLEX.2-preview.py)|
|[DiffSynth-Studio/Nexus-GenV2](https://www.modelscope.cn/models/DiffSynth-Studio/Nexus-GenV2)|`nexus_gen_reference_image`|[code](/examples/flux/model_inference/Nexus-Gen-Editing.py)|[code](/examples/flux/model_inference_low_vram/Nexus-Gen-Editing.py)|[code](/examples/flux/model_training/full/Nexus-Gen.sh)|[code](/examples/flux/model_training/validate_full/Nexus-Gen.py)|[code](/examples/flux/model_training/lora/Nexus-Gen.sh)|[code](/examples/flux/model_training/validate_lora/Nexus-Gen.py)|
</details>
### 视频生成模型
https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314
#### LTX-2: [/docs/zh/Model_Details/LTX-2.md](/docs/zh/Model_Details/LTX-2.md)
<details>
<summary>快速开始</summary>
运行以下代码可以快速加载 [Lightricks/LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 8GB 显存即可运行。
```python
import torch
from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
vram_config = {
"offload_dtype": torch.float8_e5m2,
"offload_device": "cpu",
"onload_dtype": torch.float8_e5m2,
"onload_device": "cpu",
"preparing_dtype": torch.float8_e5m2,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
"""
Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2
Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage
For the base models of LTX-2, both the official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors"))
and the repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are supported.
We have repackaged the official checkpoints in the DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules
and to avoid redundant memory usage when users only want to use part of the model.
"""
# use the repackaged modelconfig from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading
pipe = LTX2AudioVideoPipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="transformer.safetensors", **vram_config),
ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors", **vram_config),
ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_decoder.safetensors", **vram_config),
ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_decoder.safetensors", **vram_config),
ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vocoder.safetensors", **vram_config),
ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_encoder.safetensors", **vram_config),
ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
],
tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
vram_limit=torch.cuda.mem
gitextract_1d_bzl_w/ ├── .github/ │ └── workflows/ │ └── publish.yaml ├── .gitignore ├── LICENSE ├── README.md ├── README_zh.md ├── diffsynth/ │ ├── __init__.py │ ├── configs/ │ │ ├── __init__.py │ │ ├── model_configs.py │ │ └── vram_management_module_maps.py │ ├── core/ │ │ ├── __init__.py │ │ ├── attention/ │ │ │ ├── __init__.py │ │ │ └── attention.py │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── operators.py │ │ │ └── unified_dataset.py │ │ ├── device/ │ │ │ ├── __init__.py │ │ │ └── npu_compatible_device.py │ │ ├── gradient/ │ │ │ ├── __init__.py │ │ │ └── gradient_checkpoint.py │ │ ├── loader/ │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── file.py │ │ │ └── model.py │ │ ├── npu_patch/ │ │ │ └── npu_fused_operator.py │ │ └── vram/ │ │ ├── __init__.py │ │ ├── disk_map.py │ │ ├── initialization.py │ │ └── layers.py │ ├── diffusion/ │ │ ├── __init__.py │ │ ├── base_pipeline.py │ │ ├── flow_match.py │ │ ├── logger.py │ │ ├── loss.py │ │ ├── parsers.py │ │ ├── runner.py │ │ └── training_module.py │ ├── models/ │ │ ├── anima_dit.py │ │ ├── dinov3_image_encoder.py │ │ ├── flux2_dit.py │ │ ├── flux2_text_encoder.py │ │ ├── flux2_vae.py │ │ ├── flux_controlnet.py │ │ ├── flux_dit.py │ │ ├── flux_infiniteyou.py │ │ ├── flux_ipadapter.py │ │ ├── flux_lora_encoder.py │ │ ├── flux_lora_patcher.py │ │ ├── flux_text_encoder_clip.py │ │ ├── flux_text_encoder_t5.py │ │ ├── flux_vae.py │ │ ├── flux_value_control.py │ │ ├── general_modules.py │ │ ├── longcat_video_dit.py │ │ ├── ltx2_audio_vae.py │ │ ├── ltx2_common.py │ │ ├── ltx2_dit.py │ │ ├── ltx2_text_encoder.py │ │ ├── ltx2_upsampler.py │ │ ├── ltx2_video_vae.py │ │ ├── model_loader.py │ │ ├── mova_audio_dit.py │ │ ├── mova_audio_vae.py │ │ ├── mova_dual_tower_bridge.py │ │ ├── nexus_gen.py │ │ ├── nexus_gen_ar_model.py │ │ ├── nexus_gen_projector.py │ │ ├── qwen_image_controlnet.py │ │ ├── qwen_image_dit.py │ │ ├── qwen_image_image2lora.py │ │ ├── qwen_image_text_encoder.py │ │ ├── qwen_image_vae.py │ │ ├── 
sd_text_encoder.py │ │ ├── siglip2_image_encoder.py │ │ ├── step1x_connector.py │ │ ├── step1x_text_encoder.py │ │ ├── wan_video_animate_adapter.py │ │ ├── wan_video_camera_controller.py │ │ ├── wan_video_dit.py │ │ ├── wan_video_dit_s2v.py │ │ ├── wan_video_image_encoder.py │ │ ├── wan_video_mot.py │ │ ├── wan_video_motion_controller.py │ │ ├── wan_video_text_encoder.py │ │ ├── wan_video_vace.py │ │ ├── wan_video_vae.py │ │ ├── wantodance.py │ │ ├── wav2vec.py │ │ ├── z_image_controlnet.py │ │ ├── z_image_dit.py │ │ ├── z_image_image2lora.py │ │ └── z_image_text_encoder.py │ ├── pipelines/ │ │ ├── anima_image.py │ │ ├── flux2_image.py │ │ ├── flux_image.py │ │ ├── ltx2_audio_video.py │ │ ├── mova_audio_video.py │ │ ├── qwen_image.py │ │ ├── wan_video.py │ │ └── z_image.py │ ├── utils/ │ │ ├── controlnet/ │ │ │ ├── __init__.py │ │ │ ├── annotator.py │ │ │ └── controlnet_input.py │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── audio.py │ │ │ ├── audio_video.py │ │ │ └── media_io_ltx2.py │ │ ├── lora/ │ │ │ ├── __init__.py │ │ │ ├── flux.py │ │ │ ├── general.py │ │ │ ├── merge.py │ │ │ └── reset_rank.py │ │ ├── ses/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── ses.py │ │ ├── state_dict_converters/ │ │ │ ├── __init__.py │ │ │ ├── anima_dit.py │ │ │ ├── flux2_text_encoder.py │ │ │ ├── flux_controlnet.py │ │ │ ├── flux_dit.py │ │ │ ├── flux_infiniteyou.py │ │ │ ├── flux_ipadapter.py │ │ │ ├── flux_text_encoder_clip.py │ │ │ ├── flux_text_encoder_t5.py │ │ │ ├── flux_vae.py │ │ │ ├── ltx2_audio_vae.py │ │ │ ├── ltx2_dit.py │ │ │ ├── ltx2_text_encoder.py │ │ │ ├── ltx2_video_vae.py │ │ │ ├── nexus_gen.py │ │ │ ├── nexus_gen_projector.py │ │ │ ├── qwen_image_text_encoder.py │ │ │ ├── step1x_connector.py │ │ │ ├── wan_video_animate_adapter.py │ │ │ ├── wan_video_dit.py │ │ │ ├── wan_video_image_encoder.py │ │ │ ├── wan_video_mot.py │ │ │ ├── wan_video_vace.py │ │ │ ├── wan_video_vae.py │ │ │ ├── wans2v_audio_encoder.py │ │ │ └── z_image_text_encoder.py │ │ └── xfuser/ 
│ │ ├── __init__.py │ │ └── xdit_context_parallel.py │ └── version.py ├── docs/ │ ├── en/ │ │ ├── .readthedocs.yaml │ │ ├── API_Reference/ │ │ │ └── core/ │ │ │ ├── attention.md │ │ │ ├── data.md │ │ │ ├── gradient.md │ │ │ ├── loader.md │ │ │ └── vram.md │ │ ├── Developer_Guide/ │ │ │ ├── Building_a_Pipeline.md │ │ │ ├── Enabling_VRAM_management.md │ │ │ ├── Integrating_Your_Model.md │ │ │ └── Training_Diffusion_Models.md │ │ ├── Makefile │ │ ├── Model_Details/ │ │ │ ├── Anima.md │ │ │ ├── FLUX.md │ │ │ ├── FLUX2.md │ │ │ ├── LTX-2.md │ │ │ ├── Overview.md │ │ │ ├── Qwen-Image.md │ │ │ ├── Wan.md │ │ │ └── Z-Image.md │ │ ├── Pipeline_Usage/ │ │ │ ├── Environment_Variables.md │ │ │ ├── GPU_support.md │ │ │ ├── Model_Inference.md │ │ │ ├── Model_Training.md │ │ │ ├── Setup.md │ │ │ └── VRAM_management.md │ │ ├── QA.md │ │ ├── README.md │ │ ├── Research_Tutorial/ │ │ │ ├── inference_time_scaling.ipynb │ │ │ ├── inference_time_scaling.md │ │ │ ├── train_from_scratch.md │ │ │ └── train_from_scratch.py │ │ ├── Training/ │ │ │ ├── Differential_LoRA.md │ │ │ ├── Direct_Distill.md │ │ │ ├── FP8_Precision.md │ │ │ ├── Split_Training.md │ │ │ ├── Supervised_Fine_Tuning.md │ │ │ └── Understanding_Diffusion_models.md │ │ ├── conf.py │ │ └── index.rst │ ├── requirements.txt │ └── zh/ │ ├── .readthedocs.yaml │ ├── API_Reference/ │ │ └── core/ │ │ ├── attention.md │ │ ├── data.md │ │ ├── gradient.md │ │ ├── loader.md │ │ └── vram.md │ ├── Developer_Guide/ │ │ ├── Building_a_Pipeline.md │ │ ├── Enabling_VRAM_management.md │ │ ├── Integrating_Your_Model.md │ │ └── Training_Diffusion_Models.md │ ├── Makefile │ ├── Model_Details/ │ │ ├── Anima.md │ │ ├── FLUX.md │ │ ├── FLUX2.md │ │ ├── LTX-2.md │ │ ├── Overview.md │ │ ├── Qwen-Image.md │ │ ├── Wan.md │ │ └── Z-Image.md │ ├── Pipeline_Usage/ │ │ ├── Environment_Variables.md │ │ ├── GPU_support.md │ │ ├── Model_Inference.md │ │ ├── Model_Training.md │ │ ├── Setup.md │ │ └── VRAM_management.md │ ├── QA.md │ ├── README.md │ ├── 
Research_Tutorial/ │ │ ├── inference_time_scaling.ipynb │ │ ├── inference_time_scaling.md │ │ ├── train_from_scratch.md │ │ └── train_from_scratch.py │ ├── Training/ │ │ ├── Differential_LoRA.md │ │ ├── Direct_Distill.md │ │ ├── FP8_Precision.md │ │ ├── Split_Training.md │ │ ├── Supervised_Fine_Tuning.md │ │ └── Understanding_Diffusion_models.md │ ├── conf.py │ └── index.rst ├── examples/ │ ├── anima/ │ │ ├── README.md │ │ ├── model_inference/ │ │ │ └── anima-preview.py │ │ ├── model_inference_low_vram/ │ │ │ └── anima-preview.py │ │ └── model_training/ │ │ ├── full/ │ │ │ └── anima-preview.sh │ │ ├── lora/ │ │ │ └── anima-preview.sh │ │ ├── train.py │ │ ├── validate_full/ │ │ │ └── anima-preview.py │ │ └── validate_lora/ │ │ └── anima-preview.py │ ├── dev_tools/ │ │ ├── fix_path.py │ │ └── unit_test.py │ ├── flux/ │ │ ├── README.md │ │ ├── model_inference/ │ │ │ ├── FLEX.2-preview.py │ │ │ ├── FLUX.1-Kontext-dev.py │ │ │ ├── FLUX.1-Krea-dev.py │ │ │ ├── FLUX.1-dev-AttriCtrl.py │ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.py │ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.py │ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.py │ │ │ ├── FLUX.1-dev-EliGen.py │ │ │ ├── FLUX.1-dev-IP-Adapter.py │ │ │ ├── FLUX.1-dev-InfiniteYou.py │ │ │ ├── FLUX.1-dev-LoRA-Encoder.py │ │ │ ├── FLUX.1-dev-LoRA-Fusion.py │ │ │ ├── FLUX.1-dev.py │ │ │ ├── Nexus-Gen-Editing.py │ │ │ ├── Nexus-Gen-Generation.py │ │ │ └── Step1X-Edit.py │ │ ├── model_inference_low_vram/ │ │ │ ├── FLEX.2-preview.py │ │ │ ├── FLUX.1-Kontext-dev.py │ │ │ ├── FLUX.1-Krea-dev.py │ │ │ ├── FLUX.1-dev-AttriCtrl.py │ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.py │ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.py │ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.py │ │ │ ├── FLUX.1-dev-EliGen.py │ │ │ ├── FLUX.1-dev-IP-Adapter.py │ │ │ ├── FLUX.1-dev-InfiniteYou.py │ │ │ ├── FLUX.1-dev-LoRA-Encoder.py │ │ │ ├── FLUX.1-dev-LoRA-Fusion.py │ │ │ ├── FLUX.1-dev.py │ │ │ ├── Nexus-Gen-Editing.py │ │ │ ├── Nexus-Gen-Generation.py │ │ │ └── 
Step1X-Edit.py │ │ └── model_training/ │ │ ├── full/ │ │ │ ├── FLEX.2-preview.sh │ │ │ ├── FLUX.1-Kontext-dev.sh │ │ │ ├── FLUX.1-Krea-dev.sh │ │ │ ├── FLUX.1-dev-AttriCtrl.sh │ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.sh │ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.sh │ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.sh │ │ │ ├── FLUX.1-dev-IP-Adapter.sh │ │ │ ├── FLUX.1-dev-InfiniteYou.sh │ │ │ ├── FLUX.1-dev-LoRA-Encoder.sh │ │ │ ├── FLUX.1-dev.sh │ │ │ ├── Nexus-Gen.sh │ │ │ ├── Step1X-Edit.sh │ │ │ ├── accelerate_config.yaml │ │ │ ├── accelerate_config_zero2offload.yaml │ │ │ └── accelerate_config_zero3.yaml │ │ ├── lora/ │ │ │ ├── FLEX.2-preview.sh │ │ │ ├── FLUX.1-Kontext-dev.sh │ │ │ ├── FLUX.1-Krea-dev.sh │ │ │ ├── FLUX.1-dev-AttriCtrl.sh │ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.sh │ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.sh │ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.sh │ │ │ ├── FLUX.1-dev-EliGen.sh │ │ │ ├── FLUX.1-dev-IP-Adapter.sh │ │ │ ├── FLUX.1-dev-InfiniteYou.sh │ │ │ ├── FLUX.1-dev.sh │ │ │ ├── Nexus-Gen.sh │ │ │ └── Step1X-Edit.sh │ │ ├── special/ │ │ │ └── npu_training/ │ │ │ ├── FLUX.1-Kontext-dev-NPU.sh │ │ │ └── FLUX.1-dev-NPU.sh │ │ ├── train.py │ │ ├── validate_full/ │ │ │ ├── FLEX.2-preview.py │ │ │ ├── FLUX.1-Kontext-dev.py │ │ │ ├── FLUX.1-Krea-dev.py │ │ │ ├── FLUX.1-dev-AttriCtrl.py │ │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.py │ │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.py │ │ │ ├── FLUX.1-dev-Controlnet-Upscaler.py │ │ │ ├── FLUX.1-dev-IP-Adapter.py │ │ │ ├── FLUX.1-dev-InfiniteYou.py │ │ │ ├── FLUX.1-dev-LoRA-Encoder.py │ │ │ ├── FLUX.1-dev.py │ │ │ ├── Nexus-Gen.py │ │ │ └── Step1X-Edit.py │ │ └── validate_lora/ │ │ ├── FLEX.2-preview.py │ │ ├── FLUX.1-Kontext-dev.py │ │ ├── FLUX.1-Krea-dev.py │ │ ├── FLUX.1-dev-AttriCtrl.py │ │ ├── FLUX.1-dev-Controlnet-Inpainting-Beta.py │ │ ├── FLUX.1-dev-Controlnet-Union-alpha.py │ │ ├── FLUX.1-dev-Controlnet-Upscaler.py │ │ ├── FLUX.1-dev-EliGen.py │ │ ├── FLUX.1-dev-IP-Adapter.py │ │ 
├── FLUX.1-dev-InfiniteYou.py │ │ ├── FLUX.1-dev.py │ │ ├── Nexus-Gen.py │ │ └── Step1X-Edit.py │ ├── flux2/ │ │ ├── README.md │ │ ├── model_inference/ │ │ │ ├── FLUX.2-dev.py │ │ │ ├── FLUX.2-klein-4B.py │ │ │ ├── FLUX.2-klein-9B.py │ │ │ ├── FLUX.2-klein-base-4B.py │ │ │ └── FLUX.2-klein-base-9B.py │ │ ├── model_inference_low_vram/ │ │ │ ├── FLUX.2-dev.py │ │ │ ├── FLUX.2-klein-4B.py │ │ │ ├── FLUX.2-klein-9B.py │ │ │ ├── FLUX.2-klein-base-4B.py │ │ │ └── FLUX.2-klein-base-9B.py │ │ └── model_training/ │ │ ├── full/ │ │ │ ├── FLUX.2-klein-4B.sh │ │ │ ├── FLUX.2-klein-9B.sh │ │ │ ├── FLUX.2-klein-base-4B.sh │ │ │ ├── FLUX.2-klein-base-9B.sh │ │ │ ├── accelerate_config.yaml │ │ │ └── accelerate_config_zero3.yaml │ │ ├── lora/ │ │ │ ├── FLUX.2-dev.sh │ │ │ ├── FLUX.2-klein-4B.sh │ │ │ ├── FLUX.2-klein-9B.sh │ │ │ ├── FLUX.2-klein-base-4B.sh │ │ │ └── FLUX.2-klein-base-9B.sh │ │ ├── special/ │ │ │ └── npu_training/ │ │ │ ├── FLUX.2-dev-Lora-NPU.sh │ │ │ └── FLUX.2-klein-9B-NPU.sh │ │ ├── train.py │ │ ├── validate_full/ │ │ │ ├── FLUX.2-klein-4B.py │ │ │ ├── FLUX.2-klein-9B.py │ │ │ ├── FLUX.2-klein-base-4B.py │ │ │ └── FLUX.2-klein-base-9B.py │ │ └── validate_lora/ │ │ ├── FLUX.2-dev.py │ │ ├── FLUX.2-klein-4B.py │ │ ├── FLUX.2-klein-9B.py │ │ ├── FLUX.2-klein-base-4B.py │ │ └── FLUX.2-klein-base-9B.py │ ├── ltx2/ │ │ ├── README.md │ │ ├── model_inference/ │ │ │ ├── LTX-2-I2AV-DistilledPipeline.py │ │ │ ├── LTX-2-I2AV-OneStage.py │ │ │ ├── LTX-2-I2AV-TwoStage.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-In.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Left.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Out.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Right.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Jib-Down.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Jib-Up.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Static.py │ │ │ ├── LTX-2-T2AV-DistilledPipeline.py │ │ │ ├── LTX-2-T2AV-IC-LoRA-Detailer.py │ │ │ ├── LTX-2-T2AV-IC-LoRA-Union-Control.py │ │ │ ├── LTX-2-T2AV-OneStage.py │ │ │ ├── 
LTX-2-T2AV-TwoStage.py │ │ │ ├── LTX-2.3-A2V-TwoStage.py │ │ │ ├── LTX-2.3-I2AV-DistilledPipeline.py │ │ │ ├── LTX-2.3-I2AV-OneStage.py │ │ │ ├── LTX-2.3-I2AV-TwoStage.py │ │ │ ├── LTX-2.3-T2AV-DistilledPipeline.py │ │ │ ├── LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py │ │ │ ├── LTX-2.3-T2AV-IC-LoRA-Union-Control.py │ │ │ ├── LTX-2.3-T2AV-OneStage.py │ │ │ ├── LTX-2.3-T2AV-TwoStage-Retake.py │ │ │ └── LTX-2.3-T2AV-TwoStage.py │ │ ├── model_inference_low_vram/ │ │ │ ├── LTX-2-I2AV-DistilledPipeline.py │ │ │ ├── LTX-2-I2AV-OneStage.py │ │ │ ├── LTX-2-I2AV-TwoStage.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-In.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Left.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Out.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Dolly-Right.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Jib-Down.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Jib-Up.py │ │ │ ├── LTX-2-T2AV-Camera-Control-Static.py │ │ │ ├── LTX-2-T2AV-DistilledPipeline.py │ │ │ ├── LTX-2-T2AV-IC-LoRA-Detailer.py │ │ │ ├── LTX-2-T2AV-IC-LoRA-Union-Control.py │ │ │ ├── LTX-2-T2AV-OneStage.py │ │ │ ├── LTX-2-T2AV-TwoStage.py │ │ │ ├── LTX-2.3-A2V-TwoStage.py │ │ │ ├── LTX-2.3-I2AV-DistilledPipeline.py │ │ │ ├── LTX-2.3-I2AV-OneStage.py │ │ │ ├── LTX-2.3-I2AV-TwoStage.py │ │ │ ├── LTX-2.3-T2AV-DistilledPipeline.py │ │ │ ├── LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py │ │ │ ├── LTX-2.3-T2AV-IC-LoRA-Union-Control.py │ │ │ ├── LTX-2.3-T2AV-OneStage.py │ │ │ ├── LTX-2.3-T2AV-TwoStage-Retake.py │ │ │ └── LTX-2.3-T2AV-TwoStage.py │ │ └── model_training/ │ │ ├── full/ │ │ │ ├── LTX-2-T2AV-splited.sh │ │ │ ├── LTX-2.3-I2AV-splited.sh │ │ │ └── LTX-2.3-T2AV-splited.sh │ │ ├── lora/ │ │ │ ├── LTX-2-T2AV-IC-LoRA-splited.sh │ │ │ ├── LTX-2-T2AV-noaudio.sh │ │ │ ├── LTX-2-T2AV-splited.sh │ │ │ ├── LTX-2.3-I2AV-splited.sh │ │ │ ├── LTX-2.3-T2AV-IC-LoRA-splited.sh │ │ │ └── LTX-2.3-T2AV-splited.sh │ │ ├── scripts/ │ │ │ ├── split_model_statedicts.py │ │ │ └── split_model_statedicts_ltx2.3.py │ │ ├── train.py │ │ ├── 
validate_full/ │ │ │ ├── LTX-2-T2AV.py │ │ │ ├── LTX-2.3-I2AV.py │ │ │ └── LTX-2.3-T2AV.py │ │ └── validate_lora/ │ │ ├── LTX-2-T2AV-IC-LoRA.py │ │ ├── LTX-2-T2AV.py │ │ ├── LTX-2-T2AV_noaudio.py │ │ ├── LTX-2.3-I2AV.py │ │ ├── LTX-2.3-T2AV-IC-LoRA.py │ │ └── LTX-2.3-T2AV.py │ ├── mova/ │ │ ├── README.md │ │ ├── acceleration/ │ │ │ └── unified_sequence_parallel.py │ │ ├── model_inference/ │ │ │ ├── MOVA-360p-I2AV.py │ │ │ └── MOVA-720p-I2AV.py │ │ ├── model_inference_low_vram/ │ │ │ ├── MOVA-360p-I2AV.py │ │ │ └── MOVA-720p-I2AV.py │ │ └── model_training/ │ │ ├── full/ │ │ │ ├── MOVA-360P-I2AV.sh │ │ │ └── MOVA-720P-I2AV.sh │ │ ├── lora/ │ │ │ ├── MOVA-360P-I2AV.sh │ │ │ └── MOVA-720P-I2AV.sh │ │ ├── train.py │ │ ├── validate_full/ │ │ │ ├── MOVA-360p-I2AV.py │ │ │ └── MOVA-720p-I2AV.py │ │ └── validate_lora/ │ │ ├── MOVA-360p-I2AV.py │ │ └── MOVA-720p-I2AV.py │ ├── qwen_image/ │ │ ├── README.md │ │ ├── model_inference/ │ │ │ ├── FireRed-Image-Edit-1.0.py │ │ │ ├── FireRed-Image-Edit-1.1.py │ │ │ ├── Qwen-Image-2512.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-InpaintCanny.py │ │ │ ├── Qwen-Image-Distill-DMD2.py │ │ │ ├── Qwen-Image-Distill-Full.py │ │ │ ├── Qwen-Image-Distill-LoRA.py │ │ │ ├── Qwen-Image-Edit-2509.py │ │ │ ├── Qwen-Image-Edit-2511-ICEdit.py │ │ │ ├── Qwen-Image-Edit-2511-Lightning.py │ │ │ ├── Qwen-Image-Edit-2511.py │ │ │ ├── Qwen-Image-Edit-Lowres-Fix.py │ │ │ ├── Qwen-Image-Edit.py │ │ │ ├── Qwen-Image-EliGen-Poster.py │ │ │ ├── Qwen-Image-EliGen-V2.py │ │ │ ├── Qwen-Image-EliGen.py │ │ │ ├── Qwen-Image-In-Context-Control-Union.py │ │ │ ├── Qwen-Image-Layered-Control-V2.py │ │ │ ├── Qwen-Image-Layered-Control.py │ │ │ ├── Qwen-Image-Layered.py │ │ │ ├── Qwen-Image-i2L.py │ │ │ └── Qwen-Image.py │ │ ├── model_inference_low_vram/ │ │ │ ├── FireRed-Image-Edit-1.0.py │ │ │ ├── 
FireRed-Image-Edit-1.1.py │ │ │ ├── Qwen-Image-2512.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-InpaintCanny.py │ │ │ ├── Qwen-Image-Distill-DMD2.py │ │ │ ├── Qwen-Image-Distill-Full.py │ │ │ ├── Qwen-Image-Distill-LoRA.py │ │ │ ├── Qwen-Image-Edit-2509.py │ │ │ ├── Qwen-Image-Edit-2511-ICEdit.py │ │ │ ├── Qwen-Image-Edit-2511-Lightning.py │ │ │ ├── Qwen-Image-Edit-2511.py │ │ │ ├── Qwen-Image-Edit-Lowres-Fix.py │ │ │ ├── Qwen-Image-Edit.py │ │ │ ├── Qwen-Image-EliGen-Poster.py │ │ │ ├── Qwen-Image-EliGen-V2.py │ │ │ ├── Qwen-Image-EliGen.py │ │ │ ├── Qwen-Image-In-Context-Control-Union.py │ │ │ ├── Qwen-Image-Layered-Control-V2.py │ │ │ ├── Qwen-Image-Layered-Control.py │ │ │ ├── Qwen-Image-Layered.py │ │ │ ├── Qwen-Image-i2L.py │ │ │ └── Qwen-Image.py │ │ └── model_training/ │ │ ├── full/ │ │ │ ├── FireRed-Image-Edit-1.0.sh │ │ │ ├── FireRed-Image-Edit-1.1.sh │ │ │ ├── Qwen-Image-2512.sh │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.sh │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.sh │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.sh │ │ │ ├── Qwen-Image-Distill-Full.sh │ │ │ ├── Qwen-Image-Edit-2509.sh │ │ │ ├── Qwen-Image-Edit-2511.sh │ │ │ ├── Qwen-Image-Edit.sh │ │ │ ├── Qwen-Image-Layered-Control.sh │ │ │ ├── Qwen-Image-Layered.sh │ │ │ ├── Qwen-Image.sh │ │ │ ├── accelerate_config.yaml │ │ │ ├── accelerate_config_zero2offload.yaml │ │ │ └── accelerate_config_zero3.yaml │ │ ├── lora/ │ │ │ ├── FireRed-Image-Edit-1.0.sh │ │ │ ├── FireRed-Image-Edit-1.1.sh │ │ │ ├── Qwen-Image-2512.sh │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.sh │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.sh │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.sh │ │ │ ├── Qwen-Image-Distill-Full.sh │ │ │ ├── Qwen-Image-Distill-LoRA.sh │ │ │ ├── Qwen-Image-Edit-2509.sh │ │ │ ├── Qwen-Image-Edit-2511.sh │ │ │ ├── 
Qwen-Image-Edit.sh │ │ │ ├── Qwen-Image-EliGen-Poster.sh │ │ │ ├── Qwen-Image-EliGen.sh │ │ │ ├── Qwen-Image-In-Context-Control-Union.sh │ │ │ ├── Qwen-Image-Layered-Control-V2.sh │ │ │ ├── Qwen-Image-Layered-Control.sh │ │ │ ├── Qwen-Image-Layered.sh │ │ │ └── Qwen-Image.sh │ │ ├── scripts/ │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Initialize.py │ │ │ └── Qwen-Image-Blockwise-ControlNet-Inpaint-Initialize.py │ │ ├── special/ │ │ │ ├── differential_training/ │ │ │ │ └── Qwen-Image-LoRA.sh │ │ │ ├── fp8_training/ │ │ │ │ ├── Qwen-Image-LoRA.sh │ │ │ │ └── validate.py │ │ │ ├── low_vram_training/ │ │ │ │ ├── Qwen-Image-LoRA.sh │ │ │ │ ├── deepspeed_zero3_cpuoffload.yaml │ │ │ │ └── ds_z3_cpuoffload.json │ │ │ ├── npu_training/ │ │ │ │ ├── Qwen-Image-Edit-2509-LoRA-NPU.sh │ │ │ │ ├── Qwen-Image-Edit-2509-NPU.sh │ │ │ │ └── Qwen-Image-LoRA-NPU.sh │ │ │ ├── simple/ │ │ │ │ └── train.py │ │ │ └── split_training/ │ │ │ ├── Qwen-Image-LoRA.sh │ │ │ └── validate.py │ │ ├── train.py │ │ ├── validate_full/ │ │ │ ├── FireRed-Image-Edit-1.0.py │ │ │ ├── FireRed-Image-Edit-1.1.py │ │ │ ├── Qwen-Image-2512.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.py │ │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.py │ │ │ ├── Qwen-Image-Distill-Full.py │ │ │ ├── Qwen-Image-Edit-2509.py │ │ │ ├── Qwen-Image-Edit-2511.py │ │ │ ├── Qwen-Image-Edit.py │ │ │ ├── Qwen-Image-Layered-Control.py │ │ │ ├── Qwen-Image-Layered.py │ │ │ └── Qwen-Image.py │ │ └── validate_lora/ │ │ ├── FireRed-Image-Edit-1.0.py │ │ ├── FireRed-Image-Edit-1.1.py │ │ ├── Qwen-Image-2512.py │ │ ├── Qwen-Image-Blockwise-ControlNet-Canny.py │ │ ├── Qwen-Image-Blockwise-ControlNet-Depth.py │ │ ├── Qwen-Image-Blockwise-ControlNet-Inpaint.py │ │ ├── Qwen-Image-Distill-Full.py │ │ ├── Qwen-Image-Distill-LoRA.py │ │ ├── Qwen-Image-Edit-2509.py │ │ ├── Qwen-Image-Edit-2511.py │ │ ├── Qwen-Image-Edit.py │ │ ├── Qwen-Image-EliGen-Poster.py │ │ ├── Qwen-Image-EliGen.py │ │ ├── 
Qwen-Image-In-Context-Control-Union.py │ │ ├── Qwen-Image-Layered-Control-V2.py │ │ ├── Qwen-Image-Layered-Control.py │ │ ├── Qwen-Image-Layered.py │ │ └── Qwen-Image.py │ ├── wanvideo/ │ │ ├── README.md │ │ ├── acceleration/ │ │ │ └── unified_sequence_parallel.py │ │ ├── model_inference/ │ │ │ ├── LongCat-Video.py │ │ │ ├── Video-As-Prompt-Wan2.1-14B.py │ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.py │ │ │ ├── Wan2.1-FLF2V-14B-720P.py │ │ │ ├── Wan2.1-Fun-1.3B-Control.py │ │ │ ├── Wan2.1-Fun-1.3B-InP.py │ │ │ ├── Wan2.1-Fun-14B-Control.py │ │ │ ├── Wan2.1-Fun-14B-InP.py │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.py │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.py │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.py │ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.py │ │ │ ├── Wan2.1-Fun-V1.1-14B-Control.py │ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.py │ │ │ ├── Wan2.1-I2V-14B-480P.py │ │ │ ├── Wan2.1-I2V-14B-720P.py │ │ │ ├── Wan2.1-T2V-1.3B.py │ │ │ ├── Wan2.1-T2V-14B.py │ │ │ ├── Wan2.1-VACE-1.3B-Preview.py │ │ │ ├── Wan2.1-VACE-1.3B.py │ │ │ ├── Wan2.1-VACE-14B.py │ │ │ ├── Wan2.2-Animate-14B.py │ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.py │ │ │ ├── Wan2.2-Fun-A14B-Control.py │ │ │ ├── Wan2.2-Fun-A14B-InP.py │ │ │ ├── Wan2.2-I2V-A14B.py │ │ │ ├── Wan2.2-S2V-14B.py │ │ │ ├── Wan2.2-S2V-14B_multi_clips.py │ │ │ ├── Wan2.2-T2V-A14B.py │ │ │ ├── Wan2.2-TI2V-5B.py │ │ │ ├── Wan2.2-VACE-Fun-A14B.py │ │ │ ├── WanToDance-14B-global.py │ │ │ ├── WanToDance-14B-local.py │ │ │ └── krea-realtime-video.py │ │ ├── model_inference_low_vram/ │ │ │ ├── LongCat-Video.py │ │ │ ├── Video-As-Prompt-Wan2.1-14B.py │ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.py │ │ │ ├── Wan2.1-FLF2V-14B-720P.py │ │ │ ├── Wan2.1-Fun-1.3B-Control.py │ │ │ ├── Wan2.1-Fun-1.3B-InP.py │ │ │ ├── Wan2.1-Fun-14B-Control.py │ │ │ ├── Wan2.1-Fun-14B-InP.py │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.py │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.py │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.py │ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.py │ │ │ ├── 
Wan2.1-Fun-V1.1-14B-Control.py │ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.py │ │ │ ├── Wan2.1-I2V-14B-480P.py │ │ │ ├── Wan2.1-I2V-14B-720P.py │ │ │ ├── Wan2.1-T2V-1.3B.py │ │ │ ├── Wan2.1-T2V-14B.py │ │ │ ├── Wan2.1-VACE-1.3B-Preview.py │ │ │ ├── Wan2.1-VACE-1.3B.py │ │ │ ├── Wan2.1-VACE-14B.py │ │ │ ├── Wan2.2-Animate-14B.py │ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.py │ │ │ ├── Wan2.2-Fun-A14B-Control.py │ │ │ ├── Wan2.2-Fun-A14B-InP.py │ │ │ ├── Wan2.2-I2V-A14B.py │ │ │ ├── Wan2.2-S2V-14B.py │ │ │ ├── Wan2.2-S2V-14B_multi_clips.py │ │ │ ├── Wan2.2-T2V-A14B.py │ │ │ ├── Wan2.2-TI2V-5B.py │ │ │ ├── Wan2.2-VACE-Fun-A14B.py │ │ │ ├── WanToDance-14B-global.py │ │ │ ├── WanToDance-14B-local.py │ │ │ └── krea-realtime-video.py │ │ └── model_training/ │ │ ├── full/ │ │ │ ├── LongCat-Video.sh │ │ │ ├── Video-As-Prompt-Wan2.1-14B.sh │ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.sh │ │ │ ├── Wan2.1-FLF2V-14B-720P.sh │ │ │ ├── Wan2.1-Fun-1.3B-Control.sh │ │ │ ├── Wan2.1-Fun-1.3B-InP.sh │ │ │ ├── Wan2.1-Fun-14B-Control.sh │ │ │ ├── Wan2.1-Fun-14B-InP.sh │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.sh │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.sh │ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.sh │ │ │ ├── Wan2.1-Fun-V1.1-14B-Control.sh │ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.sh │ │ │ ├── Wan2.1-I2V-14B-480P.sh │ │ │ ├── Wan2.1-I2V-14B-720P.sh │ │ │ ├── Wan2.1-T2V-1.3B.sh │ │ │ ├── Wan2.1-T2V-14B.sh │ │ │ ├── Wan2.1-VACE-1.3B-Preview.sh │ │ │ ├── Wan2.1-VACE-1.3B.sh │ │ │ ├── Wan2.1-VACE-14B.sh │ │ │ ├── Wan2.2-Animate-14B.sh │ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.sh │ │ │ ├── Wan2.2-Fun-A14B-Control.sh │ │ │ ├── Wan2.2-Fun-A14B-InP.sh │ │ │ ├── Wan2.2-I2V-A14B.sh │ │ │ ├── Wan2.2-S2V-14B.sh │ │ │ ├── Wan2.2-T2V-A14B.sh │ │ │ ├── Wan2.2-TI2V-5B.sh │ │ │ ├── Wan2.2-VACE-Fun-A14B.sh │ │ │ ├── WanToDance-14B-global.sh │ │ │ ├── WanToDance-14B-local.sh │ │ │ ├── accelerate_config_14B.yaml │ │ │ ├── accelerate_config_zero3.yaml │ │ │ └── krea-realtime-video.sh │ │ ├── 
lora/ │ │ │ ├── LongCat-Video.sh │ │ │ ├── Video-As-Prompt-Wan2.1-14B.sh │ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.sh │ │ │ ├── Wan2.1-FLF2V-14B-720P.sh │ │ │ ├── Wan2.1-Fun-1.3B-Control.sh │ │ │ ├── Wan2.1-Fun-1.3B-InP.sh │ │ │ ├── Wan2.1-Fun-14B-Control.sh │ │ │ ├── Wan2.1-Fun-14B-InP.sh │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.sh │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.sh │ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.sh │ │ │ ├── Wan2.1-Fun-V1.1-14B-Control.sh │ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.sh │ │ │ ├── Wan2.1-I2V-14B-480P.sh │ │ │ ├── Wan2.1-I2V-14B-720P.sh │ │ │ ├── Wan2.1-T2V-1.3B.sh │ │ │ ├── Wan2.1-T2V-14B.sh │ │ │ ├── Wan2.1-VACE-1.3B-Preview.sh │ │ │ ├── Wan2.1-VACE-1.3B.sh │ │ │ ├── Wan2.1-VACE-14B.sh │ │ │ ├── Wan2.2-Animate-14B.sh │ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.sh │ │ │ ├── Wan2.2-Fun-A14B-Control.sh │ │ │ ├── Wan2.2-Fun-A14B-InP.sh │ │ │ ├── Wan2.2-I2V-A14B.sh │ │ │ ├── Wan2.2-S2V-14B.sh │ │ │ ├── Wan2.2-T2V-A14B.sh │ │ │ ├── Wan2.2-TI2V-5B.sh │ │ │ ├── Wan2.2-VACE-Fun-A14B.sh │ │ │ ├── WanToDance-14B-global.sh │ │ │ ├── WanToDance-14B-local.sh │ │ │ └── krea-realtime-video.sh │ │ ├── special/ │ │ │ ├── direct_distill/ │ │ │ │ ├── Wan2.1-T2V-1.3B.sh │ │ │ │ └── validate.py │ │ │ ├── fp8_training/ │ │ │ │ ├── Wan2.1-I2V-14B-480P.sh │ │ │ │ └── validate.py │ │ │ ├── low_vram_training/ │ │ │ │ ├── Wan2.1-I2V-14B-480P.sh │ │ │ │ └── validate.py │ │ │ ├── npu_training/ │ │ │ │ ├── Wan2.1-T2V-14B-NPU.sh │ │ │ │ ├── Wan2.2-T2V-A14B-NPU.sh │ │ │ │ └── Wan2.2-VACE-Fun-A14B-NPU.sh │ │ │ └── split_training/ │ │ │ ├── Wan2.1-I2V-14B-480P.sh │ │ │ └── validate.py │ │ ├── train.py │ │ ├── validate_full/ │ │ │ ├── LongCat-Video.py │ │ │ ├── Video-As-Prompt-Wan2.1-14B.py │ │ │ ├── Wan2.1-1.3b-speedcontrol-v1.py │ │ │ ├── Wan2.1-FLF2V-14B-720P.py │ │ │ ├── Wan2.1-Fun-1.3B-Control.py │ │ │ ├── Wan2.1-Fun-1.3B-InP.py │ │ │ ├── Wan2.1-Fun-14B-Control.py │ │ │ ├── Wan2.1-Fun-14B-InP.py │ │ │ ├── 
Wan2.1-Fun-V1.1-1.3B-Control-Camera.py │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.py │ │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.py │ │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.py │ │ │ ├── Wan2.1-Fun-V1.1-14B-Control.py │ │ │ ├── Wan2.1-Fun-V1.1-14B-InP.py │ │ │ ├── Wan2.1-I2V-14B-480P.py │ │ │ ├── Wan2.1-I2V-14B-720P.py │ │ │ ├── Wan2.1-T2V-1.3B.py │ │ │ ├── Wan2.1-T2V-14B.py │ │ │ ├── Wan2.1-VACE-1.3B-Preview.py │ │ │ ├── Wan2.1-VACE-1.3B.py │ │ │ ├── Wan2.1-VACE-14B.py │ │ │ ├── Wan2.2-Animate-14B.py │ │ │ ├── Wan2.2-Fun-A14B-Control-Camera.py │ │ │ ├── Wan2.2-Fun-A14B-Control.py │ │ │ ├── Wan2.2-Fun-A14B-InP.py │ │ │ ├── Wan2.2-I2V-A14B.py │ │ │ ├── Wan2.2-S2V-14B.py │ │ │ ├── Wan2.2-T2V-A14B.py │ │ │ ├── Wan2.2-TI2V-5B.py │ │ │ ├── Wan2.2-VACE-Fun-A14B.py │ │ │ ├── WanToDance-14B-global.py │ │ │ ├── WanToDance-14B-local.py │ │ │ └── krea-realtime-video.py │ │ └── validate_lora/ │ │ ├── LongCat-Video.py │ │ ├── Video-As-Prompt-Wan2.1-14B.py │ │ ├── Wan2.1-1.3b-speedcontrol-v1.py │ │ ├── Wan2.1-FLF2V-14B-720P.py │ │ ├── Wan2.1-Fun-1.3B-Control.py │ │ ├── Wan2.1-Fun-1.3B-InP.py │ │ ├── Wan2.1-Fun-14B-Control.py │ │ ├── Wan2.1-Fun-14B-InP.py │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control-Camera.py │ │ ├── Wan2.1-Fun-V1.1-1.3B-Control.py │ │ ├── Wan2.1-Fun-V1.1-1.3B-InP.py │ │ ├── Wan2.1-Fun-V1.1-14B-Control-Camera.py │ │ ├── Wan2.1-Fun-V1.1-14B-Control.py │ │ ├── Wan2.1-Fun-V1.1-14B-InP.py │ │ ├── Wan2.1-I2V-14B-480P.py │ │ ├── Wan2.1-I2V-14B-720P.py │ │ ├── Wan2.1-T2V-1.3B.py │ │ ├── Wan2.1-T2V-14B.py │ │ ├── Wan2.1-VACE-1.3B-Preview.py │ │ ├── Wan2.1-VACE-1.3B.py │ │ ├── Wan2.1-VACE-14B.py │ │ ├── Wan2.2-Animate-14B.py │ │ ├── Wan2.2-Fun-A14B-Control-Camera.py │ │ ├── Wan2.2-Fun-A14B-Control.py │ │ ├── Wan2.2-Fun-A14B-InP.py │ │ ├── Wan2.2-I2V-A14B.py │ │ ├── Wan2.2-S2V-14B.py │ │ ├── Wan2.2-T2V-A14B.py │ │ ├── Wan2.2-TI2V-5B.py │ │ ├── Wan2.2-VACE-Fun-A14B.py │ │ ├── WanToDance-14B-global.py │ │ ├── WanToDance-14B-local.py │ │ └── krea-realtime-video.py │ └── z_image/ │ ├── 
README.md │ ├── model_inference/ │ │ ├── Z-Image-Omni-Base-i2L.py │ │ ├── Z-Image-Omni-Base.py │ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.py │ │ ├── Z-Image-Turbo.py │ │ ├── Z-Image-i2L.py │ │ └── Z-Image.py │ ├── model_inference_low_vram/ │ │ ├── Z-Image-Omni-Base-i2L.py │ │ ├── Z-Image-Omni-Base.py │ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.py │ │ ├── Z-Image-Turbo.py │ │ ├── Z-Image-i2L.py │ │ └── Z-Image.py │ └── model_training/ │ ├── full/ │ │ ├── Z-Image-Omni-Base.sh │ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh │ │ ├── Z-Image-Turbo.sh │ │ ├── Z-Image.sh │ │ ├── accelerate_config.yaml │ │ └── accelerate_config_zero3.yaml │ ├── lora/ │ │ ├── Z-Image-Omni-Base.sh │ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.sh │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.sh │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.sh │ │ ├── Z-Image-Turbo.sh │ │ └── Z-Image.sh │ ├── special/ │ │ ├── differential_training/ │ │ │ ├── Z-Image-Turbo.sh │ │ │ └── validate.py │ │ ├── npu_training/ │ │ │ └── Z-Image-Turbo-NPU.sh │ │ └── trajectory_imitation/ │ │ ├── Z-Image-Turbo.sh │ │ └── validate.py │ ├── train.py │ ├── validate_full/ │ │ ├── Z-Image-Omni-Base.py │ │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py │ │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.py │ │ ├── Z-Image-Turbo.py │ │ └── Z-Image.py │ └── validate_lora/ │ ├── Z-Image-Omni-Base.py │ ├── Z-Image-Turbo-Fun-Controlnet-Tile-2.1-8steps.py │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.py │ ├── Z-Image-Turbo-Fun-Controlnet-Union-2.1.py │ ├── Z-Image-Turbo.py │ └── Z-Image.py └── pyproject.toml
Showing preview only (207K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (2502 symbols across 149 files)
FILE: diffsynth/configs/vram_management_module_maps.py
function QwenImageTextEncoder_Module_Map_Updater (line 272) | def QwenImageTextEncoder_Module_Map_Updater():
FILE: diffsynth/core/attention/attention.py
function initialize_attention_priority (line 30) | def initialize_attention_priority():
function rearrange_qkv (line 48) | def rearrange_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_p...
function rearrange_out (line 59) | def rearrange_out(out: torch.Tensor, out_pattern="b n s d", required_out...
function torch_sdpa (line 66) | def torch_sdpa(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_patt...
function flash_attention_3 (line 74) | def flash_attention_3(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,...
function flash_attention_2 (line 84) | def flash_attention_2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,...
function sage_attention (line 92) | def sage_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_...
function xformers_attention (line 100) | def xformers_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor...
function attention_forward (line 108) | def attention_forward(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,...
FILE: diffsynth/core/data/operators.py
class DataProcessingPipeline (line 8) | class DataProcessingPipeline:
method __init__ (line 9) | def __init__(self, operators=None):
method __call__ (line 12) | def __call__(self, data):
method __rshift__ (line 17) | def __rshift__(self, pipe):
class DataProcessingOperator (line 23) | class DataProcessingOperator:
method __call__ (line 24) | def __call__(self, data):
method __rshift__ (line 27) | def __rshift__(self, pipe):
class DataProcessingOperatorRaw (line 33) | class DataProcessingOperatorRaw(DataProcessingOperator):
method __call__ (line 34) | def __call__(self, data):
class ToInt (line 38) | class ToInt(DataProcessingOperator):
method __call__ (line 39) | def __call__(self, data):
class ToFloat (line 43) | class ToFloat(DataProcessingOperator):
method __call__ (line 44) | def __call__(self, data):
class ToStr (line 48) | class ToStr(DataProcessingOperator):
method __init__ (line 49) | def __init__(self, none_value=""):
method __call__ (line 52) | def __call__(self, data):
class LoadImage (line 57) | class LoadImage(DataProcessingOperator):
method __init__ (line 58) | def __init__(self, convert_RGB=True, convert_RGBA=False):
method __call__ (line 62) | def __call__(self, data: str):
class ImageCropAndResize (line 69) | class ImageCropAndResize(DataProcessingOperator):
method __init__ (line 70) | def __init__(self, height=None, width=None, max_pixels=None, height_di...
method crop_and_resize (line 77) | def crop_and_resize(self, image, target_height, target_width):
method get_height_width (line 88) | def get_height_width(self, image):
method __call__ (line 100) | def __call__(self, data: Image.Image):
class ToList (line 105) | class ToList(DataProcessingOperator):
method __call__ (line 106) | def __call__(self, data):
class FrameSamplerByRateMixin (line 110) | class FrameSamplerByRateMixin:
method __init__ (line 111) | def __init__(self, num_frames=81, time_division_factor=4, time_divisio...
method get_reader (line 118) | def get_reader(self, data: str):
method get_available_num_frames (line 121) | def get_available_num_frames(self, reader):
method get_num_frames (line 130) | def get_num_frames(self, reader):
method map_single_frame_id (line 139) | def map_single_frame_id(self, new_sequence_id: int, raw_frame_rate: fl...
class LoadVideo (line 149) | class LoadVideo(DataProcessingOperator, FrameSamplerByRateMixin):
method __init__ (line 150) | def __init__(self, num_frames=81, time_division_factor=4, time_divisio...
method __call__ (line 155) | def __call__(self, data: str):
class SequencialProcess (line 171) | class SequencialProcess(DataProcessingOperator):
method __init__ (line 172) | def __init__(self, operator=lambda x: x):
method __call__ (line 175) | def __call__(self, data):
class LoadGIF (line 179) | class LoadGIF(DataProcessingOperator):
method __init__ (line 180) | def __init__(self, num_frames=81, time_division_factor=4, time_divisio...
method get_num_frames (line 187) | def get_num_frames(self, path):
method __call__ (line 196) | def __call__(self, data: str):
class RouteByExtensionName (line 209) | class RouteByExtensionName(DataProcessingOperator):
method __init__ (line 210) | def __init__(self, operator_map):
method __call__ (line 213) | def __call__(self, data: str):
class RouteByType (line 221) | class RouteByType(DataProcessingOperator):
method __init__ (line 222) | def __init__(self, operator_map):
method __call__ (line 225) | def __call__(self, data):
class LoadTorchPickle (line 232) | class LoadTorchPickle(DataProcessingOperator):
method __init__ (line 233) | def __init__(self, map_location="cpu"):
method __call__ (line 236) | def __call__(self, data):
class ToAbsolutePath (line 240) | class ToAbsolutePath(DataProcessingOperator):
method __init__ (line 241) | def __init__(self, base_path=""):
method __call__ (line 244) | def __call__(self, data):
class LoadAudio (line 248) | class LoadAudio(DataProcessingOperator):
method __init__ (line 249) | def __init__(self, sr=16000):
method __call__ (line 251) | def __call__(self, data: str):
class LoadAudioWithTorchaudio (line 257) | class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRate...
method __init__ (line 259) | def __init__(self, num_frames=121, time_division_factor=8, time_divisi...
method __call__ (line 262) | def __call__(self, data: str):
FILE: diffsynth/core/data/unified_dataset.py
class UnifiedDataset (line 5) | class UnifiedDataset(torch.utils.data.Dataset):
method __init__ (line 6) | def __init__(
method default_image_operator (line 29) | def default_image_operator(
method default_video_operator (line 40) | def default_video_operator(
method search_for_cached_data_files (line 62) | def search_for_cached_data_files(self, path):
method load_metadata (line 70) | def load_metadata(self, metadata_path):
method __getitem__ (line 89) | def __getitem__(self, data_id):
method __len__ (line 103) | def __len__(self):
method check_data_equal (line 111) | def check_data_equal(self, data1, data2):
FILE: diffsynth/core/device/npu_compatible_device.py
function is_torch_npu_available (line 6) | def is_torch_npu_available():
function get_device_type (line 19) | def get_device_type() -> str:
function get_torch_device (line 31) | def get_torch_device() -> Any:
function get_device_id (line 42) | def get_device_id() -> int:
function get_device_name (line 47) | def get_device_name() -> str:
function synchronize (line 52) | def synchronize() -> None:
function empty_cache (line 57) | def empty_cache() -> None:
function get_nccl_backend (line 62) | def get_nccl_backend() -> str:
function enable_high_precision_for_bf16 (line 72) | def enable_high_precision_for_bf16():
function parse_device_type (line 85) | def parse_device_type(device):
function parse_nccl_backend (line 97) | def parse_nccl_backend(device_type):
function get_available_device_type (line 106) | def get_available_device_type():
FILE: diffsynth/core/gradient/gradient_checkpoint.py
function create_custom_forward (line 11) | def create_custom_forward(module):
function create_custom_forward_use_reentrant (line 17) | def create_custom_forward_use_reentrant(module):
function judge_args_requires_grad (line 23) | def judge_args_requires_grad(*args):
function gradient_checkpoint_forward (line 30) | def gradient_checkpoint_forward(
FILE: diffsynth/core/loader/config.py
class ModelConfig (line 10) | class ModelConfig:
method check_input (line 28) | def check_input(self):
method parse_original_file_pattern (line 32) | def parse_original_file_pattern(self):
method parse_download_source (line 40) | def parse_download_source(self):
method parse_skip_download (line 49) | def parse_skip_download(self):
method download (line 61) | def download(self):
method require_downloading (line 84) | def require_downloading(self):
method reset_local_model_path (line 90) | def reset_local_model_path(self):
method download_if_necessary (line 96) | def download_if_necessary(self):
method vram_config (line 109) | def vram_config(self):
FILE: diffsynth/core/loader/file.py
function load_state_dict (line 5) | def load_state_dict(file_path, torch_dtype=None, device="cpu", pin_memor...
function load_state_dict_from_safetensors (line 26) | def load_state_dict_from_safetensors(file_path, torch_dtype=None, device...
function load_state_dict_from_bin (line 36) | def load_state_dict_from_bin(file_path, torch_dtype=None, device="cpu"):
function convert_state_dict_keys_to_single_str (line 52) | def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
function hash_state_dict_keys (line 68) | def hash_state_dict_keys(state_dict, with_shape=True):
function load_keys_dict (line 74) | def load_keys_dict(file_path):
function load_keys_dict_from_safetensors (line 86) | def load_keys_dict_from_safetensors(file_path):
function convert_state_dict_to_keys_dict (line 94) | def convert_state_dict_to_keys_dict(state_dict):
function load_keys_dict_from_bin (line 104) | def load_keys_dict_from_bin(file_path):
function convert_keys_dict_to_single_str (line 110) | def convert_keys_dict_to_single_str(state_dict, with_shape=True):
function hash_model_file (line 126) | def hash_model_file(path, with_shape=True):
FILE: diffsynth/core/loader/model.py
function load_model (line 11) | def load_model(model_class, path, config=None, torch_dtype=torch.bfloat1...
function load_model_with_disk_offload (line 68) | def load_model_with_disk_offload(model_class, path, config=None, torch_d...
function get_init_context (line 91) | def get_init_context(torch_dtype, device):
FILE: diffsynth/core/npu_patch/npu_fused_operator.py
function rms_norm_forward_npu (line 9) | def rms_norm_forward_npu(self, hidden_states):
function rms_norm_forward_transformers_npu (line 16) | def rms_norm_forward_transformers_npu(self, hidden_states):
function rotary_emb_Zimage_npu (line 23) | def rotary_emb_Zimage_npu(self, x_in: torch.Tensor, freqs_cis: torch.Ten...
FILE: diffsynth/core/vram/disk_map.py
class SafetensorsCompatibleTensor (line 5) | class SafetensorsCompatibleTensor:
method __init__ (line 6) | def __init__(self, tensor):
method get_shape (line 9) | def get_shape(self):
class SafetensorsCompatibleBinaryLoader (line 13) | class SafetensorsCompatibleBinaryLoader:
method __init__ (line 14) | def __init__(self, path, device):
method keys (line 18) | def keys(self):
method get_tensor (line 21) | def get_tensor(self, name):
method get_slice (line 24) | def get_slice(self, name):
class DiskMap (line 28) | class DiskMap:
method __init__ (line 30) | def __init__(self, path, device, torch_dtype=None, state_dict_converte...
method flush_files (line 46) | def flush_files(self):
method __getitem__ (line 59) | def __getitem__(self, name):
method fetch_rename_dict (line 73) | def fetch_rename_dict(self, state_dict_converter):
method __iter__ (line 83) | def __iter__(self):
method __contains__ (line 89) | def __contains__(self, x):
FILE: diffsynth/core/vram/initialization.py
function skip_model_initialization (line 6) | def skip_model_initialization(device=torch.device("meta")):
FILE: diffsynth/core/vram/layers.py
class AutoTorchModule (line 8) | class AutoTorchModule(torch.nn.Module):
method __init__ (line 10) | def __init__(
method set_dtype_and_device (line 38) | def set_dtype_and_device(
method cast_to (line 60) | def cast_to(self, weight, dtype, device):
method check_free_vram (line 65) | def check_free_vram(self):
method offload (line 71) | def offload(self):
method onload (line 76) | def onload(self):
method param_name (line 81) | def param_name(self, name):
class AutoWrappedModule (line 88) | class AutoWrappedModule(AutoTorchModule):
method __init__ (line 90) | def __init__(
method load_from_disk (line 126) | def load_from_disk(self, torch_dtype, device, copy_module=False):
method offload_to_disk (line 140) | def offload_to_disk(self, model: torch.nn.Module):
method offload (line 150) | def offload(self):
method onload (line 159) | def onload(self):
method preparing (line 168) | def preparing(self):
method cast_to (line 177) | def cast_to(self, module, dtype, device):
method computation (line 180) | def computation(self):
method forward (line 194) | def forward(self, *args, **kwargs):
method __getattr__ (line 200) | def __getattr__(self, name):
class AutoWrappedNonRecurseModule (line 207) | class AutoWrappedNonRecurseModule(AutoWrappedModule):
method __init__ (line 209) | def __init__(
method load_from_disk (line 243) | def load_from_disk(self, torch_dtype, device, copy_module=False):
method offload_to_disk (line 256) | def offload_to_disk(self, model: torch.nn.Module):
method cast_to (line 260) | def cast_to(self, module, dtype, device):
method __getattr__ (line 264) | def __getattr__(self, name):
class AutoWrappedLinear (line 271) | class AutoWrappedLinear(torch.nn.Linear, AutoTorchModule):
method __init__ (line 272) | def __init__(
method fp8_linear (line 321) | def fp8_linear(
method load_from_disk (line 359) | def load_from_disk(self, torch_dtype, device, assign=True):
method offload (line 368) | def offload(self):
method onload (line 377) | def onload(self):
method preparing (line 386) | def preparing(self):
method computation (line 395) | def computation(self):
method linear_forward (line 410) | def linear_forward(self, x, weight, bias):
method lora_forward (line 417) | def lora_forward(self, x, out):
method forward (line 429) | def forward(self, x, *args, **kwargs):
function enable_vram_management_recursively (line 439) | def enable_vram_management_recursively(model: torch.nn.Module, module_ma...
function fill_vram_config (line 455) | def fill_vram_config(model, vram_config):
function enable_vram_management (line 468) | def enable_vram_management(model: torch.nn.Module, module_map: dict, vra...
FILE: diffsynth/diffusion/base_pipeline.py
class PipelineUnit (line 14) | class PipelineUnit:
method __init__ (line 15) | def __init__(
method fetch_input_params (line 33) | def fetch_input_params(self):
method fetch_output_params (line 47) | def fetch_output_params(self):
method process (line 54) | def process(self, pipe, **kwargs) -> dict:
method post_process (line 57) | def post_process(self, pipe, **kwargs) -> dict:
class BasePipeline (line 61) | class BasePipeline(torch.nn.Module):
method __init__ (line 63) | def __init__(
method to (line 87) | def to(self, *args, **kwargs):
method check_resize_height_width (line 97) | def check_resize_height_width(self, height, width, num_frames=None, ve...
method preprocess_image (line 117) | def preprocess_image(self, image, torch_dtype=None, device=None, patte...
method preprocess_video (line 126) | def preprocess_video(self, video, torch_dtype=None, device=None, patte...
method vae_output_to_image (line 133) | def vae_output_to_image(self, vae_output, pattern="B C H W", min_value...
method vae_output_to_video (line 143) | def vae_output_to_video(self, vae_output, pattern="B C T H W", min_val...
method output_audio_format_check (line 150) | def output_audio_format_check(self, audio_output):
method load_models_to_device (line 157) | def load_models_to_device(self, model_names):
method generate_noise (line 182) | def generate_noise(self, shape, seed=None, rand_device="cpu", rand_tor...
method get_vram (line 190) | def get_vram(self):
method get_module (line 194) | def get_module(self, model, name):
method freeze_except (line 204) | def freeze_except(self, model_names):
method blend_with_mask (line 216) | def blend_with_mask(self, base, addition, mask):
method step (line 220) | def step(self, scheduler, latents, progress_id, noise_pred, input_late...
method split_pipeline_units (line 229) | def split_pipeline_units(self, model_names: list[str]):
method flush_vram_management_device (line 233) | def flush_vram_management_device(self, device):
method load_lora (line 242) | def load_lora(
method clear_lora (line 282) | def clear_lora(self, verbose=1):
method download_and_load_models (line 296) | def download_and_load_models(self, model_configs: list[ModelConfig] = ...
method check_vram_management_state (line 313) | def check_vram_management_state(self):
method cfg_guided_model_fn (line 321) | def cfg_guided_model_fn(self, model_fn, cfg_scale, inputs_shared, inpu...
class PipelineUnitGraph (line 343) | class PipelineUnitGraph:
method __init__ (line 344) | def __init__(self):
method build_edges (line 347) | def build_edges(self, units: list[PipelineUnit]):
method build_chains (line 360) | def build_chains(self, units: list[PipelineUnit]):
method search_direct_unit_ids (line 371) | def search_direct_unit_ids(self, units: list[PipelineUnit], model_name...
method search_related_unit_ids (line 381) | def search_related_unit_ids(self, edges, start_unit_ids, direction="ta...
method search_updating_unit_ids (line 399) | def search_updating_unit_ids(self, units: list[PipelineUnit], chains, ...
method split_pipeline_units (line 419) | def split_pipeline_units(self, units: list[PipelineUnit], model_names:...
class PipelineUnitRunner (line 438) | class PipelineUnitRunner:
method __init__ (line 439) | def __init__(self):
method __call__ (line 442) | def __call__(self, unit: PipelineUnit, pipe: BasePipeline, inputs_shar...
FILE: diffsynth/diffusion/flow_match.py
class FlowMatchScheduler (line 5) | class FlowMatchScheduler():
method __init__ (line 7) | def __init__(self, template: Literal["FLUX.1", "Wan", "Qwen-Image", "F...
method set_timesteps_flux (line 20) | def set_timesteps_flux(num_inference_steps=100, denoising_strength=1.0...
method set_timesteps_wan (line 32) | def set_timesteps_wan(num_inference_steps=100, denoising_strength=1.0,...
method _calculate_shift_qwen_image (line 44) | def _calculate_shift_qwen_image(image_seq_len, base_seq_len=256, max_s...
method set_timesteps_qwen_image (line 51) | def set_timesteps_qwen_image(num_inference_steps=100, denoising_streng...
method set_timesteps_qwen_image_lightning (line 76) | def set_timesteps_qwen_image_lightning(num_inference_steps=100, denois...
method compute_empirical_mu (line 98) | def compute_empirical_mu(image_seq_len, num_steps):
method set_timesteps_flux2 (line 116) | def set_timesteps_flux2(num_inference_steps=100, denoising_strength=1....
method set_timesteps_z_image (line 133) | def set_timesteps_z_image(num_inference_steps=100, denoising_strength=...
method set_timesteps_ltx2 (line 150) | def set_timesteps_ltx2(num_inference_steps=100, denoising_strength=1.0...
method set_training_weight (line 177) | def set_training_weight(self):
method set_timesteps (line 189) | def set_timesteps(self, num_inference_steps=100, denoising_strength=1....
method step (line 201) | def step(self, model_output, timestep, sample, to_final=False, **kwargs):
method return_to_timestep (line 213) | def return_to_timestep(self, timestep, sample, sample_stablized):
method add_noise (line 221) | def add_noise(self, original_samples, noise, timestep):
method training_target (line 229) | def training_target(self, sample, noise, timestep):
method training_weight (line 233) | def training_weight(self, timestep):
FILE: diffsynth/diffusion/logger.py
class ModelLogger (line 5) | class ModelLogger:
method __init__ (line 6) | def __init__(self, output_path, remove_prefix_in_ckpt=None, state_dict...
method on_step_end (line 13) | def on_step_end(self, accelerator: Accelerator, model: torch.nn.Module...
method on_epoch_end (line 19) | def on_epoch_end(self, accelerator: Accelerator, model: torch.nn.Modul...
method on_training_end (line 30) | def on_training_end(self, accelerator: Accelerator, model: torch.nn.Mo...
method save_model (line 35) | def save_model(self, accelerator: Accelerator, model: torch.nn.Module,...
FILE: diffsynth/diffusion/loss.py
function FlowMatchSFTLoss (line 5) | def FlowMatchSFTLoss(pipe: BasePipeline, **inputs):
function FlowMatchSFTAudioVideoLoss (line 31) | def FlowMatchSFTAudioVideoLoss(pipe: BasePipeline, **inputs):
function DirectDistillLoss (line 61) | def DirectDistillLoss(pipe: BasePipeline, **inputs):
class TrajectoryImitationLoss (line 73) | class TrajectoryImitationLoss(torch.nn.Module):
method __init__ (line 74) | def __init__(self):
method initialize (line 78) | def initialize(self, device):
method fetch_trajectory (line 83) | def fetch_trajectory(self, pipe: BasePipeline, timesteps_student, inpu...
method align_trajectory (line 100) | def align_trajectory(self, pipe: BasePipeline, timesteps_teacher, traj...
method compute_regularization (line 130) | def compute_regularization(self, pipe: BasePipeline, trajectory_teache...
method forward (line 148) | def forward(self, pipe: BasePipeline, inputs_shared, inputs_posi, inpu...
FILE: diffsynth/diffusion/parsers.py
function add_dataset_base_config (line 4) | def add_dataset_base_config(parser: argparse.ArgumentParser):
function add_image_size_config (line 12) | def add_image_size_config(parser: argparse.ArgumentParser):
function add_video_size_config (line 18) | def add_video_size_config(parser: argparse.ArgumentParser):
function add_model_config (line 25) | def add_model_config(parser: argparse.ArgumentParser):
function add_training_config (line 33) | def add_training_config(parser: argparse.ArgumentParser):
function add_output_config (line 42) | def add_output_config(parser: argparse.ArgumentParser):
function add_lora_config (line 48) | def add_lora_config(parser: argparse.ArgumentParser):
function add_gradient_config (line 57) | def add_gradient_config(parser: argparse.ArgumentParser):
function add_general_config (line 63) | def add_general_config(parser: argparse.ArgumentParser):
FILE: diffsynth/diffusion/runner.py
function launch_training_task (line 8) | def launch_training_task(
function launch_data_process_task (line 50) | def launch_data_process_task(
function initialize_deepspeed_gradient_checkpointing (line 75) | def initialize_deepspeed_gradient_checkpointing(accelerator: Accelerator):
FILE: diffsynth/diffusion/training_module.py
class GeneralUnit_RemoveCache (line 8) | class GeneralUnit_RemoveCache(PipelineUnit):
method __init__ (line 9) | def __init__(self, required_params=tuple(), force_remove_params_shared...
method process_params (line 16) | def process_params(self, inputs, required_params, force_remove_params):
method process (line 23) | def process(self, pipe, inputs_shared, inputs_posi, inputs_nega):
class DiffusionTrainingModule (line 30) | class DiffusionTrainingModule(torch.nn.Module):
method __init__ (line 31) | def __init__(self):
method to (line 35) | def to(self, *args, **kwargs):
method trainable_modules (line 41) | def trainable_modules(self):
method trainable_param_names (line 46) | def trainable_param_names(self):
method add_lora_to_model (line 52) | def add_lora_to_model(self, model, target_modules, lora_rank, lora_alp...
method mapping_lora_state_dict (line 66) | def mapping_lora_state_dict(self, state_dict):
method export_trainable_state_dict (line 77) | def export_trainable_state_dict(self, state_dict, remove_prefix=None):
method transfer_data_to_device (line 90) | def transfer_data_to_device(self, data, device, torch_float_dtype=None):
method parse_vram_config (line 110) | def parse_vram_config(self, fp8=False, offload=False, device="cpu"):
method parse_model_configs (line 137) | def parse_model_configs(self, model_paths, model_id_with_origin_paths,...
method parse_path_or_model_id (line 163) | def parse_path_or_model_id(self, model_id_with_origin_path, default_va...
method auto_detect_lora_target_modules (line 177) | def auto_detect_lora_target_modules(
method parse_lora_target_modules (line 204) | def parse_lora_target_modules(self, model, lora_target_modules):
method switch_pipe_to_training_mode (line 214) | def switch_pipe_to_training_mode(
method split_pipeline_units (line 257) | def split_pipeline_units(
method parse_extra_inputs (line 285) | def parse_extra_inputs(self, data, extra_inputs, inputs_shared):
FILE: diffsynth/models/anima_dit.py
class VideoPositionEmb (line 15) | class VideoPositionEmb(nn.Module):
method forward (line 16) | def forward(self, x_B_T_H_W_C: torch.Tensor, fps=Optional[torch.Tensor...
method generate_embeddings (line 25) | def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torc...
function normalize (line 29) | def normalize(x: torch.Tensor, dim: Optional[List[int]] = None, eps: flo...
class LearnablePosEmbAxis (line 48) | class LearnablePosEmbAxis(VideoPositionEmb):
method __init__ (line 49) | def __init__(
method generate_embeddings (line 74) | def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torc...
class VideoRopePosition3DEmb (line 92) | class VideoRopePosition3DEmb(VideoPositionEmb):
method __init__ (line 93) | def __init__(
method generate_embeddings (line 135) | def generate_embeddings(
function apply_rotary_pos_emb (line 201) | def apply_rotary_pos_emb(
class GPT2FeedForward (line 212) | class GPT2FeedForward(nn.Module):
method __init__ (line 213) | def __init__(self, d_model: int, d_ff: int, device=None, dtype=None, o...
method forward (line 223) | def forward(self, x: torch.Tensor) -> torch.Tensor:
function torch_attention_op (line 231) | def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor,...
class Attention (line 261) | class Attention(nn.Module):
method __init__ (line 293) | def __init__(
method compute_qkv (line 337) | def compute_qkv(
method compute_attention (line 367) | def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch...
method forward (line 371) | def forward(
class Timesteps (line 387) | class Timesteps(nn.Module):
method __init__ (line 388) | def __init__(self, num_channels: int):
method forward (line 392) | def forward(self, timesteps_B_T: torch.Tensor) -> torch.Tensor:
class TimestepEmbedding (line 409) | class TimestepEmbedding(nn.Module):
method __init__ (line 410) | def __init__(self, in_features: int, out_features: int, use_adaln_lora...
method forward (line 425) | def forward(self, sample: torch.Tensor) -> Tuple[torch.Tensor, Optiona...
class PatchEmbed (line 440) | class PatchEmbed(nn.Module):
method __init__ (line 455) | def __init__(
method forward (line 480) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class FinalLayer (line 505) | class FinalLayer(nn.Module):
method __init__ (line 510) | def __init__(
method forward (line 540) | def forward(
class Block (line 571) | class Block(nn.Module):
method __init__ (line 592) | def __init__(
method forward (line 639) | def forward(
class MiniTrainDIT (line 756) | class MiniTrainDIT(nn.Module):
method __init__ (line 791) | def __init__(
method build_pos_embed (line 901) | def build_pos_embed(self, device=None, dtype=None) -> None:
method prepare_embedded_sequence (line 938) | def prepare_embedded_sequence(
method unpatchify (line 991) | def unpatchify(self, x_B_T_H_W_M: torch.Tensor) -> torch.Tensor:
method pad_to_patch_size (line 1001) | def pad_to_patch_size(self, img, patch_size=(2, 2), padding_mode="circ...
method forward (line 1011) | def forward(
function rotate_half (line 1086) | def rotate_half(x):
function apply_rotary_pos_emb2 (line 1092) | def apply_rotary_pos_emb2(x, cos, sin, unsqueeze_dim=1):
class RotaryEmbedding (line 1099) | class RotaryEmbedding(nn.Module):
method __init__ (line 1100) | def __init__(self, head_dim):
method forward (line 1107) | def forward(self, x, position_ids):
class LLMAdapterAttention (line 1121) | class LLMAdapterAttention(nn.Module):
method __init__ (line 1122) | def __init__(self, query_dim, context_dim, n_heads, head_dim, device=N...
method forward (line 1141) | def forward(self, x, mask=None, context=None, position_embeddings=None...
method init_weights (line 1165) | def init_weights(self):
class LLMAdapterTransformerBlock (line 1169) | class LLMAdapterTransformerBlock(nn.Module):
method __init__ (line 1170) | def __init__(self, source_dim, model_dim, num_heads=16, mlp_ratio=4.0,...
method forward (line 1204) | def forward(self, x, context, target_attention_mask=None, source_atten...
method init_weights (line 1217) | def init_weights(self):
class LLMAdapter (line 1222) | class LLMAdapter(nn.Module):
method __init__ (line 1223) | def __init__(
method forward (line 1250) | def forward(self, source_hidden_states, target_input_ids, target_atten...
class AnimaDiT (line 1272) | class AnimaDiT(MiniTrainDIT):
method __init__ (line 1273) | def __init__(self):
method preprocess_text_embeds (line 1278) | def preprocess_text_embeds(self, text_embeds, text_ids, t5xxl_weights=...
method forward (line 1290) | def forward(
FILE: diffsynth/models/dinov3_image_encoder.py
class DINOv3ImageEncoder (line 8) | class DINOv3ImageEncoder(DINOv3ViTModel):
method __init__ (line 9) | def __init__(self):
method forward (line 75) | def forward(self, image, torch_dtype=torch.bfloat16, device=get_device...
FILE: diffsynth/models/flux2_dit.py
function get_timestep_embedding (line 12) | def get_timestep_embedding(
class TimestepEmbedding (line 66) | class TimestepEmbedding(nn.Module):
method __init__ (line 67) | def __init__(
method forward (line 97) | def forward(self, sample, condition=None):
class Timesteps (line 112) | class Timesteps(nn.Module):
method __init__ (line 113) | def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale...
method forward (line 120) | def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
class AdaLayerNormContinuous (line 131) | class AdaLayerNormContinuous(nn.Module):
method __init__ (line 146) | def __init__(
method forward (line 166) | def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tenso...
function get_1d_rotary_pos_embed (line 174) | def get_1d_rotary_pos_embed(
function apply_rotary_emb (line 241) | def apply_rotary_emb(
function _get_projections (line 297) | def _get_projections(attn: "Flux2Attention", hidden_states, encoder_hidd...
function _get_fused_projections (line 311) | def _get_fused_projections(attn: "Flux2Attention", hidden_states, encode...
function _get_qkv_projections (line 321) | def _get_qkv_projections(attn: "Flux2Attention", hidden_states, encoder_...
class Flux2SwiGLU (line 325) | class Flux2SwiGLU(nn.Module):
method __init__ (line 331) | def __init__(self):
method forward (line 335) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class Flux2FeedForward (line 341) | class Flux2FeedForward(nn.Module):
method __init__ (line 342) | def __init__(
method forward (line 360) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class Flux2AttnProcessor (line 367) | class Flux2AttnProcessor:
method __init__ (line 371) | def __init__(self):
method __call__ (line 375) | def __call__(
class Flux2Attention (line 435) | class Flux2Attention(torch.nn.Module):
method __init__ (line 439) | def __init__(
method forward (line 492) | def forward(
class Flux2ParallelSelfAttnProcessor (line 505) | class Flux2ParallelSelfAttnProcessor:
method __init__ (line 509) | def __init__(self):
method __call__ (line 513) | def __call__(
class Flux2ParallelSelfAttention (line 560) | class Flux2ParallelSelfAttention(torch.nn.Module):
method __init__ (line 574) | def __init__(
method forward (line 621) | def forward(
class Flux2SingleTransformerBlock (line 633) | class Flux2SingleTransformerBlock(nn.Module):
method __init__ (line 634) | def __init__(
method forward (line 663) | def forward(
class Flux2TransformerBlock (line 702) | class Flux2TransformerBlock(nn.Module):
method __init__ (line 703) | def __init__(
method forward (line 737) | def forward(
class Flux2PosEmbed (line 795) | class Flux2PosEmbed(nn.Module):
method __init__ (line 797) | def __init__(self, theta: int, axes_dim: List[int]):
method forward (line 802) | def forward(self, ids: torch.Tensor) -> torch.Tensor:
class Flux2TimestepGuidanceEmbeddings (line 827) | class Flux2TimestepGuidanceEmbeddings(nn.Module):
method __init__ (line 828) | def __init__(
method forward (line 849) | def forward(self, timestep: torch.Tensor, guidance: torch.Tensor) -> t...
class Flux2Modulation (line 862) | class Flux2Modulation(nn.Module):
method __init__ (line 863) | def __init__(self, dim: int, mod_param_sets: int = 2, bias: bool = Fal...
method forward (line 870) | def forward(self, temb: torch.Tensor) -> Tuple[Tuple[torch.Tensor, tor...
class Flux2DiT (line 881) | class Flux2DiT(torch.nn.Module):
method __init__ (line 882) | def __init__(
method forward (line 963) | def forward(
FILE: diffsynth/models/flux2_text_encoder.py
class Flux2TextEncoder (line 4) | class Flux2TextEncoder(Mistral3ForConditionalGeneration):
method __init__ (line 5) | def __init__(self):
method forward (line 56) | def forward(self, input_ids = None, pixel_values = None, attention_mas...
FILE: diffsynth/models/flux2_vae.py
function get_activation (line 31) | def get_activation(act_fn: str) -> nn.Module:
class ResnetBlock2D (line 47) | class ResnetBlock2D(nn.Module):
method __init__ (line 78) | def __init__(
method forward (line 178) | def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *arg...
class Downsample2D (line 233) | class Downsample2D(nn.Module):
method __init__ (line 249) | def __init__(
method forward (line 296) | def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> tor...
class Upsample2D (line 315) | class Upsample2D(nn.Module):
method __init__ (line 331) | def __init__(
method forward (line 381) | def forward(self, hidden_states: torch.Tensor, output_size: Optional[i...
class Attention (line 434) | class Attention(nn.Module):
method __init__ (line 489) | def __init__(
method set_use_xla_flash_attention (line 693) | def set_use_xla_flash_attention(
method set_use_npu_flash_attention (line 726) | def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -...
method set_use_memory_efficient_attention_xformers (line 743) | def set_use_memory_efficient_attention_xformers(
method set_attention_slice (line 889) | def set_attention_slice(self, slice_size: int) -> None:
method set_processor (line 917) | def set_processor(self, processor: "AttnProcessor") -> None:
method get_processor (line 937) | def get_processor(self, return_deprecated_lora: bool = False) -> "Atte...
method forward (line 951) | def forward(
method batch_to_head_dim (line 997) | def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
method head_to_batch_dim (line 1014) | def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) ->...
method get_attention_scores (line 1041) | def get_attention_scores(
method prepare_attention_mask (line 1088) | def prepare_attention_mask(
method norm_encoder_hidden_states (line 1139) | def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tens...
method fuse_projections (line 1169) | def fuse_projections(self, fuse=True):
class AttnProcessor2_0 (line 1221) | class AttnProcessor2_0:
method __init__ (line 1226) | def __init__(self):
method __call__ (line 1230) | def __call__(
class UNetMidBlock2D (line 1314) | class UNetMidBlock2D(nn.Module):
method __init__ (line 1345) | def __init__(
method forward (line 1461) | def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Te...
class DownEncoderBlock2D (line 1475) | class DownEncoderBlock2D(nn.Module):
method __init__ (line 1476) | def __init__(
method forward (line 1539) | def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> tor...
class UpDecoderBlock2D (line 1554) | class UpDecoderBlock2D(nn.Module):
method __init__ (line 1555) | def __init__(
method forward (line 1616) | def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Te...
class Encoder (line 1626) | class Encoder(nn.Module):
method __init__ (line 1650) | def __init__(
method forward (line 1718) | def forward(self, sample: torch.Tensor) -> torch.Tensor:
class Decoder (line 1745) | class Decoder(nn.Module):
method __init__ (line 1768) | def __init__(
method forward (line 1843) | def forward(
class Flux2VAE (line 1878) | class Flux2VAE(torch.nn.Module):
method __init__ (line 1909) | def __init__(
method attn_processors (line 1987) | def attn_processors(self):
method set_attn_processor (line 2011) | def set_attn_processor(self, processor):
method _encode (line 2045) | def _encode(self, x: torch.Tensor) -> torch.Tensor:
method encode (line 2057) | def encode(
method _decode (line 2088) | def _decode(self, z: torch.Tensor, return_dict: bool = True):
method decode (line 2102) | def decode(
method blend_v (line 2136) | def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int)...
method blend_h (line 2142) | def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int)...
method _tiled_encode (line 2148) | def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
method tiled_encode (line 2196) | def tiled_encode(self, x: torch.Tensor, return_dict: bool = True):
method tiled_decode (line 2247) | def tiled_decode(self, z: torch.Tensor, return_dict: bool = True):
method forward (line 2296) | def forward(
FILE: diffsynth/models/flux_controlnet.py
function hash_state_dict_keys (line 7) | def hash_state_dict_keys(state_dict, with_shape=True):
function init_weights_on_device (line 13) | def init_weights_on_device(device = torch.device("meta"), include_buffer...
class FluxControlNet (line 61) | class FluxControlNet(torch.nn.Module):
method __init__ (line 62) | def __init__(self, disable_guidance_embedder=False, num_joint_blocks=5...
method prepare_image_ids (line 82) | def prepare_image_ids(self, latents):
method patchify (line 99) | def patchify(self, hidden_states):
method align_res_stack_to_original_blocks (line 104) | def align_res_stack_to_original_blocks(self, res_stack, num_blocks, hi...
method forward (line 112) | def forward(
method quantize (line 162) | def quantize(self):
class FluxControlNetStateDictConverter (line 265) | class FluxControlNetStateDictConverter:
method __init__ (line 266) | def __init__(self):
method from_diffusers (line 269) | def from_diffusers(self, state_dict):
method from_civitai (line 383) | def from_civitai(self, state_dict):
FILE: diffsynth/models/flux_dit.py
function interact_with_ipadapter (line 6) | def interact_with_ipadapter(hidden_states, q, ip_k, ip_v, scale=1.0):
class RoPEEmbedding (line 14) | class RoPEEmbedding(torch.nn.Module):
method __init__ (line 15) | def __init__(self, dim, theta, axes_dim):
method rope (line 22) | def rope(self, pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
method forward (line 38) | def forward(self, ids):
class FluxJointAttention (line 45) | class FluxJointAttention(torch.nn.Module):
method __init__ (line 46) | def __init__(self, dim_a, dim_b, num_heads, head_dim, only_out_a=False):
method apply_rope (line 65) | def apply_rope(self, xq, xk, freqs_cis):
method forward (line 72) | def forward(self, hidden_states_a, hidden_states_b, image_rotary_emb, ...
class FluxJointTransformerBlock (line 108) | class FluxJointTransformerBlock(torch.nn.Module):
method __init__ (line 109) | def __init__(self, dim, num_attention_heads):
method forward (line 131) | def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary...
class FluxSingleAttention (line 152) | class FluxSingleAttention(torch.nn.Module):
method __init__ (line 153) | def __init__(self, dim_a, dim_b, num_heads, head_dim):
method apply_rope (line 164) | def apply_rope(self, xq, xk, freqs_cis):
method forward (line 172) | def forward(self, hidden_states, image_rotary_emb):
class AdaLayerNormSingle (line 189) | class AdaLayerNormSingle(torch.nn.Module):
method __init__ (line 190) | def __init__(self, dim):
method forward (line 197) | def forward(self, x, emb):
class FluxSingleTransformerBlock (line 205) | class FluxSingleTransformerBlock(torch.nn.Module):
method __init__ (line 206) | def __init__(self, dim, num_attention_heads):
method apply_rope (line 220) | def apply_rope(self, xq, xk, freqs_cis):
method process_attention (line 228) | def process_attention(self, hidden_states, image_rotary_emb, attn_mask...
method forward (line 245) | def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary...
class AdaLayerNormContinuous (line 262) | class AdaLayerNormContinuous(torch.nn.Module):
method __init__ (line 263) | def __init__(self, dim):
method forward (line 269) | def forward(self, x, conditioning):
class FluxDiT (line 277) | class FluxDiT(torch.nn.Module):
method __init__ (line 278) | def __init__(self, disable_guidance_embedder=False, input_dim=64, num_...
method patchify (line 296) | def patchify(self, hidden_states):
method unpatchify (line 301) | def unpatchify(self, hidden_states, height, width):
method prepare_image_ids (line 306) | def prepare_image_ids(self, latents):
method construct_mask (line 323) | def construct_mask(self, entity_masks, prompt_seq_len, image_seq_len):
method process_entity_masks (line 358) | def process_entity_masks(self, hidden_states, prompt_emb, entity_promp...
method forward (line 386) | def forward(
FILE: diffsynth/models/flux_infiniteyou.py
function FeedForward (line 7) | def FeedForward(dim, mult=4):
function reshape_tensor (line 17) | def reshape_tensor(x, heads):
class PerceiverAttention (line 28) | class PerceiverAttention(nn.Module):
method __init__ (line 30) | def __init__(self, *, dim, dim_head=64, heads=8):
method forward (line 44) | def forward(self, x, latents):
class InfiniteYouImageProjector (line 76) | class InfiniteYouImageProjector(nn.Module):
method __init__ (line 78) | def __init__(
method forward (line 104) | def forward(self, x):
method state_dict_converter (line 119) | def state_dict_converter():
class FluxInfiniteYouImageProjectorStateDictConverter (line 123) | class FluxInfiniteYouImageProjectorStateDictConverter:
method __init__ (line 125) | def __init__(self):
method from_diffusers (line 128) | def from_diffusers(self, state_dict):
FILE: diffsynth/models/flux_ipadapter.py
class SiglipVisionModelSO400M (line 6) | class SiglipVisionModelSO400M(SiglipVisionModel):
method __init__ (line 7) | def __init__(self):
class MLPProjModel (line 23) | class MLPProjModel(torch.nn.Module):
method __init__ (line 24) | def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num...
method forward (line 37) | def forward(self, id_embeds):
class IpAdapterModule (line 43) | class IpAdapterModule(torch.nn.Module):
method __init__ (line 44) | def __init__(self, num_attention_heads, attention_head_dim, input_dim):
method forward (line 54) | def forward(self, hidden_states):
class FluxIpAdapter (line 66) | class FluxIpAdapter(torch.nn.Module):
method __init__ (line 67) | def __init__(self, num_attention_heads=24, attention_head_dim=128, cro...
method set_adapter (line 73) | def set_adapter(self):
method forward (line 76) | def forward(self, hidden_states, scale=1.0):
method state_dict_converter (line 91) | def state_dict_converter():
class FluxIpAdapterStateDictConverter (line 95) | class FluxIpAdapterStateDictConverter:
method __init__ (line 96) | def __init__(self):
method from_diffusers (line 99) | def from_diffusers(self, state_dict):
method from_civitai (line 109) | def from_civitai(self, state_dict):
FILE: diffsynth/models/flux_lora_encoder.py
function low_version_attention (line 5) | def low_version_attention(query, key, value, attn_bias=None):
class Attention (line 15) | class Attention(torch.nn.Module):
method __init__ (line 17) | def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=Fal...
method interact_with_ipadapter (line 29) | def interact_with_ipadapter(self, hidden_states, q, ip_k, ip_v, scale=...
method torch_forward (line 37) | def torch_forward(self, hidden_states, encoder_hidden_states=None, att...
method xformers_forward (line 64) | def xformers_forward(self, hidden_states, encoder_hidden_states=None, ...
method forward (line 88) | def forward(self, hidden_states, encoder_hidden_states=None, attn_mask...
class CLIPEncoderLayer (line 95) | class CLIPEncoderLayer(torch.nn.Module):
method __init__ (line 96) | def __init__(self, embed_dim, intermediate_size, num_heads=12, head_di...
method quickGELU (line 106) | def quickGELU(self, x):
method forward (line 109) | def forward(self, hidden_states, attn_mask=None):
class SDTextEncoder (line 129) | class SDTextEncoder(torch.nn.Module):
method __init__ (line 130) | def __init__(self, embed_dim=768, vocab_size=49408, max_position_embed...
method attention_mask (line 148) | def attention_mask(self, length):
method forward (line 154) | def forward(self, input_ids, clip_skip=1):
method state_dict_converter (line 165) | def state_dict_converter():
class SDTextEncoderStateDictConverter (line 169) | class SDTextEncoderStateDictConverter:
method __init__ (line 170) | def __init__(self):
method from_diffusers (line 173) | def from_diffusers(self, state_dict):
method from_civitai (line 205) | def from_civitai(self, state_dict):
class LoRALayerBlock (line 415) | class LoRALayerBlock(torch.nn.Module):
method __init__ (line 416) | def __init__(self, L, dim_in, dim_out):
method forward (line 421) | def forward(self, lora_A, lora_B):
class LoRAEmbedder (line 427) | class LoRAEmbedder(torch.nn.Module):
method __init__ (line 428) | def __init__(self, lora_patterns=None, L=1, out_dim=2048):
method default_lora_patterns (line 449) | def default_lora_patterns(self):
method forward (line 472) | def forward(self, lora):
class FluxLoRAEncoder (line 485) | class FluxLoRAEncoder(torch.nn.Module):
method __init__ (line 486) | def __init__(self, embed_dim=4096, encoder_intermediate_size=8192, num...
method forward (line 503) | def forward(self, lora):
method state_dict_converter (line 515) | def state_dict_converter():
class FluxLoRAEncoderStateDictConverter (line 519) | class FluxLoRAEncoderStateDictConverter:
method from_civitai (line 520) | def from_civitai(self, state_dict):
FILE: diffsynth/models/flux_lora_patcher.py
class GeneralLoRALoader (line 5) | class GeneralLoRALoader:
method __init__ (line 6) | def __init__(self, device="cpu", torch_dtype=torch.float32):
method get_name_dict (line 11) | def get_name_dict(self, lora_state_dict):
method load (line 28) | def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
class FluxLoRALoader (line 47) | class FluxLoRALoader(GeneralLoRALoader):
method __init__ (line 48) | def __init__(self, device="cpu", torch_dtype=torch.float32):
method load (line 123) | def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
method convert_state_dict (line 127) | def convert_state_dict(self,state_dict):
class LoraMerger (line 250) | class LoraMerger(torch.nn.Module):
method __init__ (line 251) | def __init__(self, dim):
method forward (line 262) | def forward(self, base_output, lora_outputs):
class FluxLoraPatcher (line 273) | class FluxLoraPatcher(torch.nn.Module):
method __init__ (line 274) | def __init__(self, lora_patterns=None):
method default_lora_patterns (line 284) | def default_lora_patterns(self):
method forward (line 305) | def forward(self, base_output, lora_outputs, name):
FILE: diffsynth/models/flux_text_encoder_clip.py
class Attention (line 4) | class Attention(torch.nn.Module):
method __init__ (line 6) | def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=Fal...
method forward (line 18) | def forward(self, hidden_states, encoder_hidden_states=None, attn_mask...
class CLIPEncoderLayer (line 41) | class CLIPEncoderLayer(torch.nn.Module):
method __init__ (line 42) | def __init__(self, embed_dim, intermediate_size, num_heads=12, head_di...
method quickGELU (line 52) | def quickGELU(self, x):
method forward (line 55) | def forward(self, hidden_states, attn_mask=None):
class FluxTextEncoderClip (line 75) | class FluxTextEncoderClip(torch.nn.Module):
method __init__ (line 76) | def __init__(self, embed_dim=768, vocab_size=49408, max_position_embed...
method attention_mask (line 94) | def attention_mask(self, length):
method forward (line 100) | def forward(self, input_ids, clip_skip=2, extra_mask=None):
FILE: diffsynth/models/flux_text_encoder_t5.py
class FluxTextEncoderT5 (line 5) | class FluxTextEncoderT5(T5EncoderModel):
method __init__ (line 6) | def __init__(self):
method forward (line 40) | def forward(self, input_ids):
FILE: diffsynth/models/flux_vae.py
class TileWorker (line 5) | class TileWorker:
method __init__ (line 6) | def __init__(self):
method mask (line 10) | def mask(self, height, width, border_width):
method tile (line 20) | def tile(self, model_input, tile_size, tile_stride, tile_device, tile_...
method tiled_inference (line 34) | def tiled_inference(self, forward_fn, model_input, tile_batch_size, in...
method io_scale (line 57) | def io_scale(self, model_output, tile_size):
method untile (line 64) | def untile(self, model_output, height, width, tile_size, tile_stride, ...
method tiled_forward (line 83) | def tiled_forward(self, forward_fn, model_input, tile_size, tile_strid...
class ConvAttention (line 109) | class ConvAttention(torch.nn.Module):
method __init__ (line 111) | def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=Fal...
method forward (line 123) | def forward(self, hidden_states, encoder_hidden_states=None, attn_mask...
class Attention (line 153) | class Attention(torch.nn.Module):
method __init__ (line 155) | def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=Fal...
method forward (line 167) | def forward(self, hidden_states, encoder_hidden_states=None, attn_mask...
class VAEAttentionBlock (line 190) | class VAEAttentionBlock(torch.nn.Module):
method __init__ (line 192) | def __init__(self, num_attention_heads, attention_head_dim, in_channel...
method forward (line 223) | def forward(self, hidden_states, time_emb, text_emb, res_stack):
class ResnetBlock (line 240) | class ResnetBlock(torch.nn.Module):
method __init__ (line 241) | def __init__(self, in_channels, out_channels, temb_channels=None, grou...
method forward (line 254) | def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwar...
class UpSampler (line 272) | class UpSampler(torch.nn.Module):
method __init__ (line 273) | def __init__(self, channels):
method forward (line 277) | def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwar...
class DownSampler (line 283) | class DownSampler(torch.nn.Module):
method __init__ (line 284) | def __init__(self, channels, padding=1, extra_padding=False):
method forward (line 289) | def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwar...
class FluxVAEDecoder (line 296) | class FluxVAEDecoder(torch.nn.Module):
method __init__ (line 297) | def __init__(self, use_conv_attention=True):
method tiled_forward (line 333) | def tiled_forward(self, sample, tile_size=64, tile_stride=32):
method forward (line 344) | def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, *...
class FluxVAEEncoder (line 368) | class FluxVAEEncoder(torch.nn.Module):
method __init__ (line 369) | def __init__(self, use_conv_attention=True):
method tiled_forward (line 401) | def tiled_forward(self, sample, tile_size=64, tile_stride=32):
method forward (line 412) | def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, *...
method encode_video (line 436) | def encode_video(self, sample, batch_size=8):
FILE: diffsynth/models/flux_value_control.py
class MultiValueEncoder (line 5) | class MultiValueEncoder(torch.nn.Module):
method __init__ (line 6) | def __init__(self, encoders=()):
method __call__ (line 12) | def __call__(self, values, dtype):
class SingleValueEncoder (line 22) | class SingleValueEncoder(torch.nn.Module):
method __init__ (line 23) | def __init__(self, dim_in=256, dim_out=4096, prefer_len=32, computatio...
method forward (line 34) | def forward(self, value, dtype):
method state_dict_converter (line 44) | def state_dict_converter():
class SingleValueEncoderStateDictConverter (line 48) | class SingleValueEncoderStateDictConverter:
method __init__ (line 49) | def __init__(self):
method from_diffusers (line 52) | def from_diffusers(self, state_dict):
method from_civitai (line 55) | def from_civitai(self, state_dict):
FILE: diffsynth/models/general_modules.py
function get_timestep_embedding (line 4) | def get_timestep_embedding(
class TemporalTimesteps (line 43) | class TemporalTimesteps(torch.nn.Module):
method __init__ (line 44) | def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale...
method forward (line 53) | def forward(self, timesteps):
class DiffusersCompatibleTimestepProj (line 66) | class DiffusersCompatibleTimestepProj(torch.nn.Module):
method __init__ (line 67) | def __init__(self, dim_in, dim_out):
method forward (line 73) | def forward(self, x):
class TimestepEmbeddings (line 80) | class TimestepEmbeddings(torch.nn.Module):
method __init__ (line 81) | def __init__(self, dim_in, dim_out, computation_device=None, diffusers...
method forward (line 94) | def forward(self, timestep, dtype, addition_t_cond=None):
class RMSNorm (line 104) | class RMSNorm(torch.nn.Module):
method __init__ (line 105) | def __init__(self, dim, eps, elementwise_affine=True):
method forward (line 113) | def forward(self, hidden_states):
class AdaLayerNorm (line 123) | class AdaLayerNorm(torch.nn.Module):
method __init__ (line 124) | def __init__(self, dim, single=False, dual=False):
method forward (line 131) | def forward(self, x, emb):
FILE: diffsynth/models/longcat_video_dit.py
class RMSNorm_FP32 (line 16) | class RMSNorm_FP32(torch.nn.Module):
method __init__ (line 17) | def __init__(self, dim: int, eps: float):
method _norm (line 22) | def _norm(self, x):
method forward (line 25) | def forward(self, x):
function broadcat (line 30) | def broadcat(tensors, dim=-1):
function rotate_half (line 49) | def rotate_half(x):
class RotaryPositionalEmbedding (line 56) | class RotaryPositionalEmbedding(nn.Module):
method __init__ (line 58) | def __init__(self,
method register_grid_size (line 77) | def register_grid_size(self, grid_size):
method precompute_freqs_cis_3d (line 83) | def precompute_freqs_cis_3d(self, grid_size):
method forward (line 114) | def forward(self, q, k, grid_size):
class Attention (line 138) | class Attention(nn.Module):
method __init__ (line 139) | def __init__(
method _process_attn (line 173) | def _process_attn(self, q, k, v, shape):
method forward (line 181) | def forward(self, x: torch.Tensor, shape=None, num_cond_latents=None, ...
method forward_with_kv_cache (line 223) | def forward_with_kv_cache(self, x: torch.Tensor, shape=None, num_cond_...
class MultiHeadCrossAttention (line 258) | class MultiHeadCrossAttention(nn.Module):
method __init__ (line 259) | def __init__(
method _process_cross_attn (line 285) | def _process_cross_attn(self, x, cond, kv_seqlen):
method forward (line 304) | def forward(self, x, cond, kv_seqlen, num_cond_latents=None, shape=None):
class LayerNorm_FP32 (line 328) | class LayerNorm_FP32(nn.LayerNorm):
method __init__ (line 329) | def __init__(self, dim, eps, elementwise_affine):
method forward (line 332) | def forward(self, inputs: torch.Tensor) -> torch.Tensor:
function modulate_fp32 (line 344) | def modulate_fp32(norm_func, x, shift, scale):
class FinalLayer_FP32 (line 355) | class FinalLayer_FP32(nn.Module):
method __init__ (line 360) | def __init__(self, hidden_size, num_patch, out_channels, adaln_tembed_...
method forward (line 371) | def forward(self, x, t, latent_shape):
class FeedForwardSwiGLU (line 384) | class FeedForwardSwiGLU(nn.Module):
method __init__ (line 385) | def __init__(
method forward (line 405) | def forward(self, x):
class TimestepEmbedder (line 409) | class TimestepEmbedder(nn.Module):
method __init__ (line 414) | def __init__(self, t_embed_dim, frequency_embedding_size=256):
method timestep_embedding (line 425) | def timestep_embedding(t, dim, max_period=10000):
method forward (line 443) | def forward(self, t, dtype):
class CaptionEmbedder (line 451) | class CaptionEmbedder(nn.Module):
method __init__ (line 456) | def __init__(self, in_channels, hidden_size):
method forward (line 466) | def forward(self, caption):
class PatchEmbed3D (line 472) | class PatchEmbed3D(nn.Module):
method __init__ (line 482) | def __init__(
method forward (line 503) | def forward(self, x):
class LongCatSingleStreamBlock (line 526) | class LongCatSingleStreamBlock(nn.Module):
method __init__ (line 527) | def __init__(
method forward (line 573) | def forward(self, x, y, t, y_seqlen, latent_shape, num_cond_latents=No...
class LongCatVideoTransformer3DModel (line 629) | class LongCatVideoTransformer3DModel(torch.nn.Module):
method __init__ (line 630) | def __init__(
method enable_loras (line 697) | def enable_loras(self, lora_key_list=[]):
method _create_multi_lora_forward (line 720) | def _create_multi_lora_forward(self, module, loras):
method _get_module_by_name (line 737) | def _get_module_by_name(self, module_name):
method disable_all_loras (line 746) | def disable_all_loras(self):
method enable_bsa (line 758) | def enable_bsa(self,):
method disable_bsa (line 762) | def disable_bsa(self,):
method forward (line 766) | def forward(
method unpatchify (line 866) | def unpatchify(self, x, N_t, N_h, N_w):
method state_dict_converter (line 889) | def state_dict_converter():
class LongCatVideoTransformer3DModelDictConverter (line 893) | class LongCatVideoTransformer3DModelDictConverter:
method __init__ (line 894) | def __init__(self):
method from_diffusers (line 897) | def from_diffusers(self, state_dict):
method from_civitai (line 900) | def from_civitai(self, state_dict):
FILE: diffsynth/models/ltx2_audio_vae.py
class AudioProcessor (line 12) | class AudioProcessor(nn.Module):
method __init__ (line 15) | def __init__(
method resample_waveform (line 40) | def resample_waveform(
method waveform_to_mel (line 52) | def waveform_to_mel(
class AudioPatchifier (line 67) | class AudioPatchifier(Patchifier):
method __init__ (line 68) | def __init__(
method patch_size (line 103) | def patch_size(self) -> Tuple[int, int, int]:
method get_token_count (line 106) | def get_token_count(self, tgt_shape: AudioLatentShape) -> int:
method _get_audio_latent_time_in_sec (line 109) | def _get_audio_latent_time_in_sec(
method _compute_audio_timings (line 144) | def _compute_audio_timings(
method patchify (line 180) | def patchify(
method unpatchify (line 201) | def unpatchify(
method unpatchify_audio (line 227) | def unpatchify_audio(
method get_patch_grid_bounds (line 241) | def get_patch_grid_bounds(
class AttentionType (line 263) | class AttentionType(Enum):
class AttnBlock (line 271) | class AttnBlock(torch.nn.Module):
method __init__ (line 272) | def __init__(
method forward (line 286) | def forward(self, x: torch.Tensor) -> torch.Tensor:
function make_attn (line 313) | def make_attn(
class CausalityAxis (line 329) | class CausalityAxis(Enum):
class CausalConv2d (line 338) | class CausalConv2d(torch.nn.Module):
method __init__ (line 346) | def __init__(
method forward (line 392) | def forward(self, x: torch.Tensor) -> torch.Tensor:
function make_conv2d (line 398) | def make_conv2d(
class ResBlock1 (line 448) | class ResBlock1(torch.nn.Module):
method __init__ (line 449) | def __init__(self, channels: int, kernel_size: int = 3, dilation: Tupl...
method forward (line 509) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class ResBlock2 (line 519) | class ResBlock2(torch.nn.Module):
method __init__ (line 520) | def __init__(self, channels: int, kernel_size: int = 3, dilation: Tupl...
method forward (line 543) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class ResnetBlock (line 551) | class ResnetBlock(torch.nn.Module):
method __init__ (line 552) | def __init__(
method forward (line 591) | def forward(
class Downsample (line 615) | class Downsample(torch.nn.Module):
method __init__ (line 622) | def __init__(
method forward (line 640) | def forward(self, x: torch.Tensor) -> torch.Tensor:
function build_downsampling_path (line 664) | def build_downsampling_path( # noqa: PLR0913
class Upsample (line 717) | class Upsample(torch.nn.Module):
method __init__ (line 718) | def __init__(
method forward (line 730) | def forward(self, x: torch.Tensor) -> torch.Tensor:
function build_upsampling_path (line 763) | def build_upsampling_path( # noqa: PLR0913
class PerChannelStatistics (line 814) | class PerChannelStatistics(nn.Module):
method __init__ (line 820) | def __init__(self, latent_channels: int = 128) -> None:
method un_normalize (line 825) | def un_normalize(self, x: torch.Tensor) -> torch.Tensor:
method normalize (line 828) | def normalize(self, x: torch.Tensor) -> torch.Tensor:
function build_mid_block (line 835) | def build_mid_block(
function run_mid_block (line 866) | def run_mid_block(mid: torch.nn.Module, features: torch.Tensor) -> torch...
class LTX2AudioEncoder (line 873) | class LTX2AudioEncoder(torch.nn.Module):
method __init__ (line 880) | def __init__( # noqa: PLR0913
method forward (line 1003) | def forward(self, spectrogram: torch.Tensor) -> torch.Tensor:
method _run_downsampling_path (line 1018) | def _run_downsampling_path(self, h: torch.Tensor) -> torch.Tensor:
method _finalize_output (line 1031) | def _finalize_output(self, h: torch.Tensor) -> torch.Tensor:
method _normalize_latents (line 1036) | def _normalize_latents(self, latent_output: torch.Tensor) -> torch.Ten...
class LTX2AudioDecoder (line 1062) | class LTX2AudioDecoder(torch.nn.Module):
method __init__ (line 1069) | def __init__( # noqa: PLR0913
method forward (line 1170) | def forward(self, sample: torch.Tensor) -> torch.Tensor:
method _denormalize_latents (line 1187) | def _denormalize_latents(self, sample: torch.Tensor) -> tuple[torch.Te...
method _adjust_output_shape (line 1212) | def _adjust_output_shape(
method _run_upsampling_path (line 1259) | def _run_upsampling_path(self, h: torch.Tensor) -> torch.Tensor:
method _finalize_output (line 1272) | def _finalize_output(self, h: torch.Tensor) -> torch.Tensor:
function get_padding (line 1282) | def get_padding(kernel_size: int, dilation: int = 1) -> int:
function _sinc (line 1292) | def _sinc(x: torch.Tensor) -> torch.Tensor:
function kaiser_sinc_filter1d (line 1300) | def kaiser_sinc_filter1d(cutoff: float, half_width: float, kernel_size: ...
class LowPassFilter1d (line 1321) | class LowPassFilter1d(nn.Module):
method __init__ (line 1322) | def __init__(
method forward (line 1345) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class UpSample1d (line 1352) | class UpSample1d(nn.Module):
method __init__ (line 1353) | def __init__(
method forward (line 1391) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class DownSample1d (line 1399) | class DownSample1d(nn.Module):
method __init__ (line 1400) | def __init__(self, ratio: int = 2, kernel_size: int | None = None) -> ...
method forward (line 1411) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class Activation1d (line 1415) | class Activation1d(nn.Module):
method __init__ (line 1416) | def __init__(
method forward (line 1429) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class Snake (line 1435) | class Snake(nn.Module):
method __init__ (line 1436) | def __init__(
method forward (line 1449) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class SnakeBeta (line 1456) | class SnakeBeta(nn.Module):
method __init__ (line 1457) | def __init__(
method forward (line 1472) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class AMPBlock1 (line 1481) | class AMPBlock1(nn.Module):
method __init__ (line 1482) | def __init__(
method forward (line 1531) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class LTX2Vocoder (line 1541) | class LTX2Vocoder(torch.nn.Module):
method __init__ (line 1565) | def __init__( # noqa: PLR0913
method forward (line 1646) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class _STFTFn (line 1689) | class _STFTFn(nn.Module):
method __init__ (line 1697) | def __init__(self, filter_length: int, hop_length: int, win_length: in...
method forward (line 1705) | def forward(self, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
class MelSTFT (line 1727) | class MelSTFT(nn.Module):
method __init__ (line 1735) | def __init__(
method mel_spectrogram (line 1750) | def mel_spectrogram(self, y: torch.Tensor) -> tuple[torch.Tensor, torc...
class LTX2VocoderWithBWE (line 1767) | class LTX2VocoderWithBWE(nn.Module):
method __init__ (line 1775) | def __init__(
method conv_pre (line 1827) | def conv_pre(self) -> nn.Conv1d:
method conv_post (line 1831) | def conv_post(self) -> nn.Conv1d:
method _compute_mel (line 1834) | def _compute_mel(self, audio: torch.Tensor) -> torch.Tensor:
method forward (line 1846) | def forward(self, mel_spec: torch.Tensor) -> torch.Tensor:
FILE: diffsynth/models/ltx2_common.py
class VideoPixelShape (line 8) | class VideoPixelShape(NamedTuple):
class SpatioTemporalScaleFactors (line 20) | class SpatioTemporalScaleFactors(NamedTuple):
method default (line 31) | def default(cls) -> "SpatioTemporalScaleFactors":
class VideoLatentShape (line 38) | class VideoLatentShape(NamedTuple):
method to_torch_shape (line 52) | def to_torch_shape(self) -> torch.Size:
method from_torch_shape (line 56) | def from_torch_shape(shape: torch.Size) -> "VideoLatentShape":
method mask_shape (line 65) | def mask_shape(self) -> "VideoLatentShape":
method from_pixel_shape (line 69) | def from_pixel_shape(
method upscale (line 86) | def upscale(self, scale_factors: SpatioTemporalScaleFactors = VIDEO_SC...
class AudioLatentShape (line 95) | class AudioLatentShape(NamedTuple):
method to_torch_shape (line 106) | def to_torch_shape(self) -> torch.Size:
method mask_shape (line 109) | def mask_shape(self) -> "AudioLatentShape":
method from_torch_shape (line 113) | def from_torch_shape(shape: torch.Size) -> "AudioLatentShape":
method from_duration (line 122) | def from_duration(
method from_video_pixel_shape (line 141) | def from_video_pixel_shape(
class LatentState (line 161) | class LatentState:
method clone (line 176) | def clone(self) -> "LatentState":
class NormType (line 185) | class NormType(Enum):
class PixelNorm (line 192) | class PixelNorm(nn.Module):
method __init__ (line 200) | def __init__(self, dim: int = 1, eps: float = 1e-8) -> None:
method forward (line 210) | def forward(self, x: torch.Tensor) -> torch.Tensor:
function build_normalization_layer (line 221) | def build_normalization_layer(
function rms_norm (line 240) | def rms_norm(x: torch.Tensor, weight: torch.Tensor | None = None, eps: f...
class Modality (line 249) | class Modality:
function to_denoised (line 285) | def to_denoised(
class Patchifier (line 302) | class Patchifier(Protocol):
method patchify (line 307) | def patchify(
method unpatchify (line 320) | def unpatchify(
method patch_size (line 336) | def patch_size(self) -> Tuple[int, int, int]:
method get_patch_grid_bounds (line 342) | def get_patch_grid_bounds(
function get_pixel_coords (line 359) | def get_pixel_coords(
FILE: diffsynth/models/ltx2_dit.py
function get_timestep_embedding (line 14) | def get_timestep_embedding(
class TimestepEmbedding (line 65) | class TimestepEmbedding(torch.nn.Module):
method __init__ (line 66) | def __init__(
method forward (line 92) | def forward(self, sample: torch.Tensor, condition: torch.Tensor | None...
class Timesteps (line 107) | class Timesteps(torch.nn.Module):
method __init__ (line 108) | def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale...
method forward (line 115) | def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
class PixArtAlphaCombinedTimestepSizeEmbeddings (line 126) | class PixArtAlphaCombinedTimestepSizeEmbeddings(torch.nn.Module):
method __init__ (line 133) | def __init__(
method forward (line 144) | def forward(
class PerturbationType (line 154) | class PerturbationType(Enum):
class Perturbation (line 164) | class Perturbation:
method is_perturbed (line 170) | def is_perturbed(self, perturbation_type: PerturbationType, block: int...
class PerturbationConfig (line 181) | class PerturbationConfig:
method is_perturbed (line 186) | def is_perturbed(self, perturbation_type: PerturbationType, block: int...
method empty (line 193) | def empty() -> "PerturbationConfig":
class BatchedPerturbationConfig (line 198) | class BatchedPerturbationConfig:
method mask (line 203) | def mask(
method mask_like (line 213) | def mask_like(self, perturbation_type: PerturbationType, block: int, v...
method any_in_batch (line 217) | def any_in_batch(self, perturbation_type: PerturbationType, block: int...
method all_in_batch (line 220) | def all_in_batch(self, perturbation_type: PerturbationType, block: int...
method empty (line 224) | def empty(batch_size: int) -> "BatchedPerturbationConfig":
function adaln_embedding_coefficient (line 234) | def adaln_embedding_coefficient(cross_attention_adaln: bool) -> int:
class AdaLayerNormSingle (line 239) | class AdaLayerNormSingle(torch.nn.Module):
method __init__ (line 248) | def __init__(self, embedding_dim: int, embedding_coefficient: int = 6):
method forward (line 259) | def forward(
class LTXRopeType (line 268) | class LTXRopeType(Enum):
function apply_rotary_emb (line 273) | def apply_rotary_emb(
function apply_interleaved_rotary_emb (line 287) | def apply_interleaved_rotary_emb(
function apply_split_rotary_emb (line 300) | def apply_split_rotary_emb(
function generate_freq_grid_np (line 328) | def generate_freq_grid_np(
function generate_freq_grid_pytorch (line 349) | def generate_freq_grid_pytorch(
function get_fractional_positions (line 372) | def get_fractional_positions(indices_grid: torch.Tensor, max_pos: list[i...
function generate_freqs (line 384) | def generate_freqs(
function split_freqs_cis (line 402) | def split_freqs_cis(freqs: torch.Tensor, pad_size: int, num_attention_he...
function interleaved_freqs_cis (line 425) | def interleaved_freqs_cis(freqs: torch.Tensor, pad_size: int) -> tuple[t...
function precompute_freqs_cis (line 436) | def precompute_freqs_cis(
class Attention (line 465) | class Attention(torch.nn.Module):
method __init__ (line 466) | def __init__(
method forward (line 500) | def forward(
class PixArtAlphaTextProjection (line 556) | class PixArtAlphaTextProjection(torch.nn.Module):
method __init__ (line 562) | def __init__(self, in_features: int, hidden_size: int, out_features: i...
method forward (line 575) | def forward(self, caption: torch.Tensor) -> torch.Tensor:
class TransformerArgs (line 582) | class TransformerArgs:
class TransformerArgsPreprocessor (line 599) | class TransformerArgsPreprocessor:
method __init__ (line 600) | def __init__( # noqa: PLR0913
method _prepare_timestep (line 628) | def _prepare_timestep(
method _prepare_context (line 642) | def _prepare_context(
method _prepare_attention_mask (line 653) | def _prepare_attention_mask(self, attention_mask: torch.Tensor | None,...
method _prepare_self_attention_mask (line 662) | def _prepare_self_attention_mask(
method _prepare_positional_embeddings (line 690) | def _prepare_positional_embeddings(
method prepare (line 714) | def prepare(
class MultiModalTransformerArgsPreprocessor (line 756) | class MultiModalTransformerArgsPreprocessor:
method __init__ (line 757) | def __init__( # noqa: PLR0913
method prepare (line 797) | def prepare(
method _prepare_cross_attention_timestep (line 839) | def _prepare_cross_attention_timestep(
class TransformerConfig (line 866) | class TransformerConfig:
class BasicAVTransformerBlock (line 875) | class BasicAVTransformerBlock(torch.nn.Module):
method __init__ (line 876) | def __init__(
method get_ada_values (line 970) | def get_ada_values(
method get_av_ca_ada_values (line 981) | def get_av_ca_ada_values(
method _apply_text_cross_attention (line 1002) | def _apply_text_cross_attention(
method forward (line 1031) | def forward( # noqa: PLR0915
function apply_cross_attention_adaln (line 1223) | def apply_cross_attention_adaln(
class GELUApprox (line 1245) | class GELUApprox(torch.nn.Module):
method __init__ (line 1246) | def __init__(self, dim_in: int, dim_out: int) -> None:
method forward (line 1250) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class FeedForward (line 1254) | class FeedForward(torch.nn.Module):
method __init__ (line 1255) | def __init__(self, dim: int, dim_out: int, mult: int = 4) -> None:
method forward (line 1262) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class LTXModelType (line 1266) | class LTXModelType(Enum):
method is_video_enabled (line 1271) | def is_video_enabled(self) -> bool:
method is_audio_enabled (line 1274) | def is_audio_enabled(self) -> bool:
class LTXModel (line 1278) | class LTXModel(torch.nn.Module):
method __init__ (line 1284) | def __init__( # noqa: PLR0913
method _adaln_embedding_coefficient (line 1367) | def _adaln_embedding_coefficient(self) -> int:
method _init_video (line 1370) | def _init_video(
method _init_audio (line 1395) | def _init_audio(
method _init_audio_video (line 1422) | def _init_audio_video(
method _init_preprocessors (line 1447) | def _init_preprocessors(
method _init_transformer_blocks (line 1523) | def _init_transformer_blocks(
method set_gradient_checkpointing (line 1571) | def set_gradient_checkpointing(self, enable: bool) -> None:
method _process_transformer_blocks (line 1581) | def _process_transformer_blocks(
method _process_output (line 1604) | def _process_output(
method _forward (line 1624) | def _forward(
method forward (line 1674) | def forward(self, video_latents, video_positions, video_context, video...
FILE: diffsynth/models/ltx2_text_encoder.py
class LTX2TextEncoder (line 11) | class LTX2TextEncoder(Gemma3ForConditionalGeneration):
method __init__ (line 12) | def __init__(self):
class LTXVGemmaTokenizer (line 90) | class LTXVGemmaTokenizer:
method __init__ (line 97) | def __init__(self, tokenizer_path: str, max_length: int = 1024):
method tokenize_with_weights (line 114) | def tokenize_with_weights(self, text: str, return_word_ids: bool = Fal...
class GemmaFeaturesExtractorProjLinear (line 153) | class GemmaFeaturesExtractorProjLinear(nn.Module):
method __init__ (line 163) | def __init__(self) -> None:
method forward (line 171) | def forward(
class GemmaSeperatedFeaturesExtractorProjLinear (line 185) | class GemmaSeperatedFeaturesExtractorProjLinear(nn.Module):
method __init__ (line 188) | def __init__(
method forward (line 201) | def forward(
class _BasicTransformerBlock1D (line 220) | class _BasicTransformerBlock1D(nn.Module):
method __init__ (line 221) | def __init__(
method forward (line 244) | def forward(
class Embeddings1DConnector (line 277) | class Embeddings1DConnector(nn.Module):
method __init__ (line 298) | def __init__(
method _replace_padded_with_learnable_registers (line 340) | def _replace_padded_with_learnable_registers(
method forward (line 368) | def forward(
class LTX2TextEncoderPostModules (line 406) | class LTX2TextEncoderPostModules(nn.Module):
method __init__ (line 407) | def __init__(
method create_embeddings (line 442) | def create_embeddings(
method process_hidden_states (line 454) | def process_hidden_states(
function _norm_and_concat_padded_batch (line 466) | def _norm_and_concat_padded_batch(
function _convert_to_additive_mask (line 516) | def _convert_to_additive_mask(attention_mask: torch.Tensor, dtype: torch...
function _to_binary_mask (line 520) | def _to_binary_mask(encoded: torch.Tensor, encoded_mask: torch.Tensor) -...
function norm_and_concat_per_token_rms (line 528) | def norm_and_concat_per_token_rms(
function _rescale_norm (line 547) | def _rescale_norm(x: torch.Tensor, target_dim: int, source_dim: int) -> ...
FILE: diffsynth/models/ltx2_upsampler.py
class PixelShuffleND (line 8) | class PixelShuffleND(torch.nn.Module):
method __init__ (line 28) | def __init__(self, dims: int, upscale_factors: tuple[int, int, int] = ...
method forward (line 34) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class ResBlock (line 60) | class ResBlock(torch.nn.Module):
method __init__ (line 70) | def __init__(self, channels: int, mid_channels: Optional[int] = None, ...
method forward (line 83) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class BlurDownsample (line 94) | class BlurDownsample(torch.nn.Module):
method __init__ (line 100) | def __init__(self, dims: int, stride: int, kernel_size: int = 5) -> None:
method forward (line 120) | def forward(self, x: torch.Tensor) -> torch.Tensor:
method _apply_2d (line 135) | def _apply_2d(self, x2d: torch.Tensor) -> torch.Tensor:
function _rational_for_scale (line 142) | def _rational_for_scale(scale: float) -> Tuple[int, int]:
class SpatialRationalResampler (line 149) | class SpatialRationalResampler(torch.nn.Module):
method __init__ (line 164) | def __init__(self, mid_channels: int, scale: float):
method forward (line 172) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class LTX2LatentUpsampler (line 182) | class LTX2LatentUpsampler(torch.nn.Module):
method __init__ (line 195) | def __init__(
method forward (line 252) | def forward(self, latent: torch.Tensor) -> torch.Tensor:
function upsample_video (line 299) | def upsample_video(latent: torch.Tensor, video_encoder: LTX2VideoEncoder...
FILE: diffsynth/models/ltx2_video_vae.py
class VideoLatentPatchifier (line 18) | class VideoLatentPatchifier(Patchifier):
method __init__ (line 19) | def __init__(self, patch_size: int):
method patch_size (line 28) | def patch_size(self) -> Tuple[int, int, int]:
method get_token_count (line 31) | def get_token_count(self, tgt_shape: VideoLatentShape) -> int:
method patchify (line 34) | def patchify(
method unpatchify (line 48) | def unpatchify(
method unpatchify_video (line 71) | def unpatchify_video(
method get_patch_grid_bounds (line 89) | def get_patch_grid_bounds(
class NormLayerType (line 162) | class NormLayerType(Enum):
class LogVarianceType (line 167) | class LogVarianceType(Enum):
class PaddingModeType (line 174) | class PaddingModeType(Enum):
class DualConv3d (line 181) | class DualConv3d(nn.Module):
method __init__ (line 183) | def __init__(
method reset_parameters (line 249) | def reset_parameters(self) -> None:
method forward (line 260) | def forward(
method forward_with_3d (line 271) | def forward_with_3d(self, x: torch.Tensor, skip_time_conv: bool = Fals...
method forward_with_2d (line 301) | def forward_with_2d(self, x: torch.Tensor, skip_time_conv: bool = Fals...
method weight (line 353) | def weight(self) -> torch.Tensor:
class CausalConv3d (line 357) | class CausalConv3d(nn.Module):
method __init__ (line 359) | def __init__(
method forward (line 396) | def forward(self, x: torch.Tensor, causal: bool = True) -> torch.Tensor:
method weight (line 408) | def weight(self) -> torch.Tensor:
function make_conv_nd (line 412) | def make_conv_nd( # noqa: PLR0913
function make_linear_nd (line 477) | def make_linear_nd(
function patchify (line 491) | def patchify(x: torch.Tensor, patch_size_hw: int, patch_size_t: int = 1)...
function unpatchify (line 520) | def unpatchify(x: torch.Tensor, patch_size_hw: int, patch_size_t: int = ...
class PerChannelStatistics (line 548) | class PerChannelStatistics(nn.Module):
method __init__ (line 554) | def __init__(self, latent_channels: int = 128):
method un_normalize (line 559) | def un_normalize(self, x: torch.Tensor) -> torch.Tensor:
method normalize (line 563) | def normalize(self, x: torch.Tensor) -> torch.Tensor:
class ResnetBlock3D (line 568) | class ResnetBlock3D(nn.Module):
method __init__ (line 580) | def __init__(
method _feed_spatial_noise (line 654) | def _feed_spatial_noise(
method forward (line 671) | def forward(
class UNetMidBlock3D (line 738) | class UNetMidBlock3D(nn.Module):
method __init__ (line 759) | def __init__(
method forward (line 796) | def forward(
class SpaceToDepthDownsample (line 825) | class SpaceToDepthDownsample(nn.Module):
method __init__ (line 827) | def __init__(
method forward (line 848) | def forward(
class DepthToSpaceUpsample (line 882) | class DepthToSpaceUpsample(nn.Module):
method __init__ (line 884) | def __init__(
method forward (line 908) | def forward(
function compute_trapezoidal_mask_1d (line 941) | def compute_trapezoidal_mask_1d(
class SpatialTilingConfig (line 981) | class SpatialTilingConfig:
method __post_init__ (line 991) | def __post_init__(self) -> None:
class TemporalTilingConfig (line 1005) | class TemporalTilingConfig:
method __post_init__ (line 1016) | def __post_init__(self) -> None:
class TilingConfig (line 1030) | class TilingConfig:
method default (line 1041) | def default(cls) -> "TilingConfig":
class DimensionIntervals (line 1049) | class DimensionIntervals:
class LatentIntervals (line 1065) | class LatentIntervals:
function default_split_operation (line 1080) | def default_split_operation(length: int) -> DimensionIntervals:
function default_mapping_operation (line 1087) | def default_mapping_operation(_intervals: DimensionIntervals,) -> tuple[...
class Tile (line 1094) | class Tile(NamedTuple):
method blend_mask (line 1115) | def blend_mask(self) -> torch.Tensor:
function create_tiles_from_intervals_and_mappers (line 1142) | def create_tiles_from_intervals_and_mappers(
function create_tiles (line 1172) | def create_tiles(
function _make_encoder_block (line 1188) | def _make_encoder_block(
class LTX2VideoEncoder (line 1294) | class LTX2VideoEncoder(nn.Module):
method __init__ (line 1326) | def __init__(
method forward (line 1430) | def forward(self, sample: torch.Tensor) -> torch.Tensor:
method tiled_encode_video (line 1494) | def tiled_encode_video(
method encode (line 1647) | def encode(
function _make_decoder_block (line 1670) | def _make_decoder_block(
class LTX2VideoDecoder (line 1752) | class LTX2VideoDecoder(nn.Module):
method __init__ (line 1782) | def __init__(
method forward (line 1901) | def forward(
method _prepare_tiles (line 1990) | def _prepare_tiles(
method tiled_decode (line 2022) | def tiled_decode(
method _group_tiles_by_temporal_slice (line 2112) | def _group_tiles_by_temporal_slice(self, tiles: List[Tile]) -> List[Li...
method _accumulate_temporal_group_into_buffer (line 2136) | def _accumulate_temporal_group_into_buffer(
method decode (line 2182) | def decode(
function decode_video (line 2207) | def decode_video(
function get_video_chunks_number (line 2237) | def get_video_chunks_number(num_frames: int, tiling_config: TilingConfig...
function split_in_spatial (line 2253) | def split_in_spatial(size: int, overlap: int) -> SplitOperation:
function split_in_temporal (line 2269) | def split_in_temporal(size: int, overlap: int) -> SplitOperation:
function to_mapping_operation (line 2285) | def to_mapping_operation(
function map_temporal_slice (line 2307) | def map_temporal_slice(begin: int, end: int, left_ramp: int, right_ramp:...
function map_spatial_slice (line 2316) | def map_spatial_slice(begin: int, end: int, left_ramp: int, right_ramp: ...
FILE: diffsynth/models/model_loader.py
class ModelPool (line 7) | class ModelPool:
method __init__ (line 8) | def __init__(self):
method import_model_class (line 13) | def import_model_class(self, model_class):
method need_to_enable_vram_management (line 19) | def need_to_enable_vram_management(self, vram_config):
method fetch_module_map (line 22) | def fetch_module_map(self, model_class, vram_config):
method load_model_file (line 33) | def load_model_file(self, config, path, vram_config, vram_limit=None, ...
method default_vram_config (line 51) | def default_vram_config(self):
method auto_load_model (line 64) | def auto_load_model(self, path, vram_config=None, vram_limit=None, cle...
method fetch_model (line 84) | def fetch_model(self, model_name, index=None):
method clear_parameters (line 109) | def clear_parameters(self, model: torch.nn.Module):
FILE: diffsynth/models/mova_audio_dit.py
function precompute_freqs_cis_1d (line 7) | def precompute_freqs_cis_1d(dim: int, end: int = 16384, theta: float = 1...
class MovaAudioDit (line 11) | class MovaAudioDit(WanModel):
method __init__ (line 12) | def __init__(self, *args, **kwargs):
method precompute_freqs_cis (line 20) | def precompute_freqs_cis(self, dim: int, end: int = 16384, theta: floa...
method forward (line 23) | def forward(self,
method unpatchify (line 52) | def unpatchify(self, x: torch.Tensor, grid_size: torch.Tensor):
FILE: diffsynth/models/mova_audio_vae.py
function WNConv1d (line 10) | def WNConv1d(*args, **kwargs):
function WNConvTranspose1d (line 14) | def WNConvTranspose1d(*args, **kwargs):
function snake (line 20) | def snake(x, alpha):
class Snake1d (line 28) | class Snake1d(nn.Module):
method __init__ (line 29) | def __init__(self, channels):
method forward (line 33) | def forward(self, x):
class VectorQuantize (line 37) | class VectorQuantize(nn.Module):
method __init__ (line 49) | def __init__(self, input_dim: int, codebook_size: int, codebook_dim: i...
method forward (line 58) | def forward(self, z):
method embed_code (line 96) | def embed_code(self, embed_id):
method decode_code (line 99) | def decode_code(self, embed_id):
method decode_latents (line 102) | def decode_latents(self, latents):
class ResidualVectorQuantize (line 121) | class ResidualVectorQuantize(nn.Module):
method __init__ (line 127) | def __init__(
method forward (line 151) | def forward(self, z, n_quantizers: int = None):
method from_codes (line 224) | def from_codes(self, codes: torch.Tensor):
method from_latents (line 246) | def from_latents(self, latents: torch.Tensor):
class AbstractDistribution (line 282) | class AbstractDistribution:
method sample (line 283) | def sample(self):
method mode (line 286) | def mode(self):
class DiracDistribution (line 290) | class DiracDistribution(AbstractDistribution):
method __init__ (line 291) | def __init__(self, value):
method sample (line 294) | def sample(self):
method mode (line 297) | def mode(self):
class DiagonalGaussianDistribution (line 301) | class DiagonalGaussianDistribution(object):
method __init__ (line 302) | def __init__(self, parameters, deterministic=False):
method sample (line 312) | def sample(self):
method kl (line 316) | def kl(self, other=None):
method nll (line 335) | def nll(self, sample, dims=[1, 2]):
method mode (line 344) | def mode(self):
function normal_kl (line 348) | def normal_kl(mean1, logvar1, mean2, logvar2):
function init_weights (line 371) | def init_weights(m):
class ResidualUnit (line 377) | class ResidualUnit(nn.Module):
method __init__ (line 378) | def __init__(self, dim: int = 16, dilation: int = 1):
method forward (line 388) | def forward(self, x):
class EncoderBlock (line 396) | class EncoderBlock(nn.Module):
method __init__ (line 397) | def __init__(self, dim: int = 16, stride: int = 1):
method forward (line 413) | def forward(self, x):
class Encoder (line 417) | class Encoder(nn.Module):
method __init__ (line 418) | def __init__(
method forward (line 443) | def forward(self, x):
class DecoderBlock (line 447) | class DecoderBlock(nn.Module):
method __init__ (line 448) | def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: i...
method forward (line 465) | def forward(self, x):
class Decoder (line 469) | class Decoder(nn.Module):
method __init__ (line 470) | def __init__(
method forward (line 497) | def forward(self, x):
class DacVAE (line 501) | class DacVAE(nn.Module):
method __init__ (line 503) | def __init__(
method get_delay (line 564) | def get_delay(self):
method get_output_length (line 590) | def get_output_length(self, input_length):
method dtype (line 608) | def dtype(self):
method device (line 616) | def device(self):
method preprocess (line 623) | def preprocess(self, audio_data, sample_rate):
method encode (line 634) | def encode(
method decode (line 678) | def decode(self, z: torch.Tensor):
method forward (line 703) | def forward(
method remove_weight_norm (line 771) | def remove_weight_norm(self):
FILE: diffsynth/models/mova_dual_tower_bridge.py
class RotaryEmbedding (line 9) | class RotaryEmbedding(nn.Module):
method __init__ (line 12) | def __init__(self, base: float, dim: int, device=None):
method forward (line 23) | def forward(self, x, position_ids):
function rotate_half (line 37) | def rotate_half(x):
function apply_rotary_pos_emb (line 45) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di...
class PerFrameAttentionPooling (line 72) | class PerFrameAttentionPooling(nn.Module):
method __init__ (line 83) | def __init__(self, dim: int, num_heads: int, eps: float = 1e-6):
method forward (line 95) | def forward(self, x: torch.Tensor, grid_size: Tuple[int, int, int]) ->...
class CrossModalInteractionController (line 125) | class CrossModalInteractionController:
method __init__ (line 131) | def __init__(self, visual_layers: int = 30, audio_layers: int = 30):
method get_interaction_layers (line 136) | def get_interaction_layers(self, strategy: str = "shallow_focus") -> D...
method should_interact (line 189) | def should_interact(self, layer_idx: int, direction: str, interaction_...
class ConditionalCrossAttention (line 207) | class ConditionalCrossAttention(nn.Module):
method __init__ (line 208) | def __init__(self, dim: int, kv_dim: int, num_heads: int, eps: float =...
method forward (line 224) | def forward(self, x: torch.Tensor, y: torch.Tensor, x_freqs: Optional[...
class AdaLayerNorm (line 252) | class AdaLayerNorm(nn.Module):
method __init__ (line 265) | def __init__(
method forward (line 288) | def forward(
class ConditionalCrossAttentionBlock (line 312) | class ConditionalCrossAttentionBlock(nn.Module):
method __init__ (line 317) | def __init__(self, dim: int, kv_dim: int, num_heads: int, eps: float =...
method forward (line 326) | def forward(
class DualTowerConditionalBridge (line 350) | class DualTowerConditionalBridge(nn.Module):
method __init__ (line 354) | def __init__(self,
method build_aligned_freqs (line 410) | def build_aligned_freqs(self,
method should_interact (line 466) | def should_interact(self, layer_idx: int, direction: str) -> bool:
method apply_conditional_control (line 469) | def apply_conditional_control(
method forward (line 535) | def forward(
FILE: diffsynth/models/nexus_gen.py
class NexusGenAutoregressiveModel (line 5) | class NexusGenAutoregressiveModel(torch.nn.Module):
method __init__ (line 6) | def __init__(self, max_length=1024, max_pixels=262640):
method load_processor (line 72) | def load_processor(self, path):
method state_dict_converter (line 78) | def state_dict_converter():
method bound_image (line 81) | def bound_image(self, image, max_pixels=262640):
method get_editing_msg (line 90) | def get_editing_msg(self, instruction):
method get_generation_msg (line 96) | def get_generation_msg(self, instruction):
method forward (line 101) | def forward(self, instruction, ref_image=None, num_img_tokens=81):
method get_target_embeddings (line 116) | def get_target_embeddings(self, images, messages, processor, model, nu...
class NexusGenAutoregressiveModelStateDictConverter (line 155) | class NexusGenAutoregressiveModelStateDictConverter:
method __init__ (line 156) | def __init__(self):
method from_civitai (line 159) | def from_civitai(self, state_dict):
FILE: diffsynth/models/nexus_gen_ar_model.py
class Qwen2_5_VLCausalLMOutputWithPast (line 35) | class Qwen2_5_VLCausalLMOutputWithPast(ModelOutput):
class Qwen2_5_VLForConditionalGeneration (line 74) | class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, Gene...
method __init__ (line 79) | def __init__(self, config):
method get_input_embeddings (line 92) | def get_input_embeddings(self):
method set_input_embeddings (line 95) | def set_input_embeddings(self, value):
method get_output_embeddings (line 98) | def get_output_embeddings(self):
method set_output_embeddings (line 101) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 104) | def set_decoder(self, decoder):
method get_decoder (line 107) | def get_decoder(self):
method get_rope_index (line 110) | def get_rope_index(
method forward (line 289) | def forward(
method _sample (line 508) | def _sample(
method prepare_prefilled_image_embeds (line 725) | def prepare_prefilled_image_embeds(self, cur_image_tokens, num_img_tok...
method get_default_image_grid_thw (line 734) | def get_default_image_grid_thw(self,):
method get_num_image_tokens (line 738) | def get_num_image_tokens(self, image_grid_thw):
method _validate_model_kwargs (line 742) | def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
method prepare_image_position_ids (line 747) | def prepare_image_position_ids(self, input_ids, generation_image_grid_...
method prepare_inputs_for_generation (line 766) | def prepare_inputs_for_generation(
method _get_image_nums_and_video_nums (line 809) | def _get_image_nums_and_video_nums(
method _expand_inputs_for_generation (line 838) | def _expand_inputs_for_generation(
class Qwen2_5_VLVideosProcessorKwargs (line 936) | class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
class Qwen2_5_VLProcessorKwargs (line 940) | class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
class Qwen2_5_VLProcessor (line 950) | class Qwen2_5_VLProcessor(ProcessorMixin):
method __init__ (line 970) | def __init__(self, image_processor=None, tokenizer=None, chat_template...
method __call__ (line 975) | def __call__(
method batch_decode (line 1083) | def batch_decode(self, *args, **kwargs):
method batch_decode_all2all (line 1090) | def batch_decode_all2all(self, *args, **kwargs):
method decode (line 1101) | def decode(self, *args, **kwargs):
method post_process_image_text_to_text (line 1108) | def post_process_image_text_to_text(
method model_input_names (line 1136) | def model_input_names(self):
FILE: diffsynth/models/nexus_gen_projector.py
function rotate_half (line 8) | def rotate_half(x):
function apply_multimodal_rotary_pos_emb (line 15) | def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqu...
class Qwen2_5_VLRotaryEmbedding (line 29) | class Qwen2_5_VLRotaryEmbedding(nn.Module):
method __init__ (line 30) | def __init__(self, config, device=None):
method _dynamic_frequency_update (line 49) | def _dynamic_frequency_update(self, position_ids, device):
method forward (line 69) | def forward(self, x, position_ids):
function repeat_kv (line 93) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
class Qwen2_5_VLAttention (line 105) | class Qwen2_5_VLAttention(nn.Module):
method __init__ (line 106) | def __init__(self, config, layer_idx: Optional[int] = None):
method forward (line 131) | def forward(
class Qwen2MLP (line 181) | class Qwen2MLP(nn.Module):
method __init__ (line 182) | def __init__(self, config):
method forward (line 193) | def forward(self, x):
class Qwen2RMSNorm (line 198) | class Qwen2RMSNorm(nn.Module):
method __init__ (line 199) | def __init__(self, hidden_size, eps=1e-6):
method forward (line 207) | def forward(self, hidden_states):
method extra_repr (line 214) | def extra_repr(self):
class Qwen2_5_VLDecoderLayer (line 218) | class Qwen2_5_VLDecoderLayer(nn.Module):
method __init__ (line 219) | def __init__(self, config, layer_idx):
method forward (line 229) | def forward(
class NexusGenImageEmbeddingMerger (line 255) | class NexusGenImageEmbeddingMerger(nn.Module):
method __init__ (line 256) | def __init__(self, num_layers=1, out_channel=4096, expand_ratio=4, dev...
method get_position_ids (line 327) | def get_position_ids(self, image_grid_thw):
method forward (line 357) | def forward(self, embeds, embeds_grid, ref_embeds=None, ref_embeds_gri...
method state_dict_converter (line 373) | def state_dict_converter():
class NexusGenMergerStateDictConverter (line 377) | class NexusGenMergerStateDictConverter:
method __init__ (line 378) | def __init__(self):
method from_diffusers (line 381) | def from_diffusers(self, state_dict):
method from_civitai (line 384) | def from_civitai(self, state_dict):
class NexusGenAdapter (line 389) | class NexusGenAdapter(nn.Module):
method __init__ (line 393) | def __init__(self, input_dim=3584, output_dim=4096):
method forward (line 400) | def forward(self, x):
method state_dict_converter (line 404) | def state_dict_converter():
class NexusGenAdapterStateDictConverter (line 408) | class NexusGenAdapterStateDictConverter:
method __init__ (line 409) | def __init__(self):
method from_diffusers (line 412) | def from_diffusers(self, state_dict):
method from_civitai (line 415) | def from_civitai(self, state_dict):
FILE: diffsynth/models/qwen_image_controlnet.py
class BlockWiseControlBlock (line 6) | class BlockWiseControlBlock(torch.nn.Module):
method __init__ (line 8) | def __init__(self, dim: int = 3072):
method forward (line 16) | def forward(self, x, y):
method init_weights (line 23) | def init_weights(self):
class QwenImageBlockWiseControlNet (line 29) | class QwenImageBlockWiseControlNet(torch.nn.Module):
method __init__ (line 30) | def __init__(
method init_weight (line 46) | def init_weight(self):
method process_controlnet_conditioning (line 52) | def process_controlnet_conditioning(self, controlnet_conditioning):
method blockwise_forward (line 55) | def blockwise_forward(self, img, controlnet_conditioning, block_id):
FILE: diffsynth/models/qwen_image_dit.py
function qwen_image_flash_attention (line 14) | def qwen_image_flash_attention(q: torch.Tensor, k: torch.Tensor, v: torc...
class ApproximateGELU (line 42) | class ApproximateGELU(nn.Module):
method __init__ (line 43) | def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
method forward (line 47) | def forward(self, x: torch.Tensor) -> torch.Tensor:
function apply_rotary_emb_qwen (line 51) | def apply_rotary_emb_qwen(
class QwenEmbedRope (line 60) | class QwenEmbedRope(nn.Module):
method __init__ (line 61) | def __init__(self, theta: int, axes_dim: list[int], scale_rope=False):
method rope_params (line 80) | def rope_params(self, index, dim, theta=10000):
method _expand_pos_freqs_if_needed (line 94) | def _expand_pos_freqs_if_needed(self, video_fhw, txt_seq_lens):
method forward (line 123) | def forward(self, video_fhw, txt_seq_lens, device):
method forward_sampling (line 168) | def forward_sampling(self, video_fhw, txt_seq_lens, device):
class QwenEmbedLayer3DRope (line 228) | class QwenEmbedLayer3DRope(nn.Module):
method __init__ (line 229) | def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
method rope_params (line 254) | def rope_params(self, index, dim, theta=10000):
method forward (line 264) | def forward(self, video_fhw, txt_seq_lens, device):
method _compute_video_freqs (line 305) | def _compute_video_freqs(self, frame, height, width, idx=0):
method _compute_condition_freqs (line 324) | def _compute_condition_freqs(self, frame, height, width):
class QwenFeedForward (line 343) | class QwenFeedForward(nn.Module):
method __init__ (line 344) | def __init__(
method forward (line 357) | def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> tor...
class QwenDoubleStreamAttention (line 362) | class QwenDoubleStreamAttention(nn.Module):
method __init__ (line 363) | def __init__(
method forward (line 389) | def forward(
class QwenImageTransformerBlock (line 434) | class QwenImageTransformerBlock(nn.Module):
method __init__ (line 435) | def __init__(
method _modulate (line 470) | def _modulate(self, x, mod_params, index=None):
method forward (line 503) | def forward(
class QwenImageDiT (line 551) | class QwenImageDiT(torch.nn.Module):
method __init__ (line 552) | def __init__(
method process_entity_masks (line 585) | def process_entity_masks(self, latents, prompt_emb, prompt_emb_mask, e...
method forward (line 653) | def forward(
FILE: diffsynth/models/qwen_image_image2lora.py
class CompressedMLP (line 4) | class CompressedMLP(torch.nn.Module):
method __init__ (line 5) | def __init__(self, in_dim, mid_dim, out_dim, bias=False):
method forward (line 10) | def forward(self, x, residual=None):
class ImageEmbeddingToLoraMatrix (line 17) | class ImageEmbeddingToLoraMatrix(torch.nn.Module):
method __init__ (line 18) | def __init__(self, in_dim, compress_dim, lora_a_dim, lora_b_dim, rank):
method forward (line 26) | def forward(self, x, residual=None):
class SequencialMLP (line 32) | class SequencialMLP(torch.nn.Module):
method __init__ (line 33) | def __init__(self, length, in_dim, mid_dim, out_dim, bias=False):
method forward (line 41) | def forward(self, x):
class LoRATrainerBlock (line 49) | class LoRATrainerBlock(torch.nn.Module):
method __init__ (line 50) | def __init__(self, lora_patterns, in_dim=1536+4096, compress_dim=128, ...
method forward (line 63) | def forward(self, x, residual=None):
class QwenImageImage2LoRAModel (line 74) | class QwenImageImage2LoRAModel(torch.nn.Module):
method __init__ (line 75) | def __init__(self, num_blocks=60, use_residual=True, compress_dim=128,...
method forward (line 108) | def forward(self, x, residual=None):
method initialize_weights (line 119) | def initialize_weights(self):
FILE: diffsynth/models/qwen_image_text_encoder.py
class QwenImageTextEncoder (line 5) | class QwenImageTextEncoder(torch.nn.Module):
method __init__ (line 6) | def __init__(self):
method forward (line 148) | def forward(
FILE: diffsynth/models/qwen_image_vae.py
class QwenImageCausalConv3d (line 8) | class QwenImageCausalConv3d(torch.nn.Conv3d):
method __init__ (line 23) | def __init__(
method forward (line 43) | def forward(self, x, cache_x=None):
class QwenImageRMS_norm (line 54) | class QwenImageRMS_norm(nn.Module):
method __init__ (line 66) | def __init__(self, dim: int, channel_first: bool = True, images: bool ...
method forward (line 76) | def forward(self, x):
class QwenImageResidualBlock (line 81) | class QwenImageResidualBlock(nn.Module):
method __init__ (line 92) | def __init__(
method forward (line 112) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class QwenImageAttentionBlock (line 156) | class QwenImageAttentionBlock(nn.Module):
method __init__ (line 164) | def __init__(self, dim):
method forward (line 173) | def forward(self, x):
class QwenImageUpsample (line 202) | class QwenImageUpsample(nn.Upsample):
method forward (line 213) | def forward(self, x):
class QwenImageResample (line 218) | class QwenImageResample(nn.Module):
method __init__ (line 232) | def __init__(self, dim: int, mode: str) -> None:
method forward (line 257) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class QwenImageMidBlock (line 304) | class QwenImageMidBlock(nn.Module):
method __init__ (line 314) | def __init__(self, dim: int, dropout: float = 0.0, non_linearity: str ...
method forward (line 329) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class QwenImageEncoder3d (line 344) | class QwenImageEncoder3d(nn.Module):
method __init__ (line 359) | def __init__(
method forward (line 412) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class QwenImageUpBlock (line 453) | class QwenImageUpBlock(nn.Module):
method __init__ (line 466) | def __init__(
method forward (line 496) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class QwenImageDecoder3d (line 523) | class QwenImageDecoder3d(nn.Module):
method __init__ (line 538) | def __init__(
method forward (line 603) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class QwenImageVAE (line 642) | class QwenImageVAE(torch.nn.Module):
method __init__ (line 643) | def __init__(
method encode (line 709) | def encode(self, x, **kwargs):
method decode (line 719) | def decode(self, x, **kwargs):
FILE: diffsynth/models/sd_text_encoder.py
function low_version_attention (line 6) | def low_version_attention(query, key, value, attn_bias=None):
class Attention (line 16) | class Attention(torch.nn.Module):
method __init__ (line 18) | def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=Fal...
method interact_with_ipadapter (line 30) | def interact_with_ipadapter(self, hidden_states, q, ip_k, ip_v, scale=...
method torch_forward (line 38) | def torch_forward(self, hidden_states, encoder_hidden_states=None, att...
method xformers_forward (line 65) | def xformers_forward(self, hidden_states, encoder_hidden_states=None, ...
method forward (line 89) | def forward(self, hidden_states, encoder_hidden_states=None, attn_mask...
class CLIPEncoderLayer (line 96) | class CLIPEncoderLayer(torch.nn.Module):
method __init__ (line 97) | def __init__(self, embed_dim, intermediate_size, num_heads=12, head_di...
method quickGELU (line 107) | def quickGELU(self, x):
method forward (line 110) | def forward(self, hidden_states, attn_mask=None):
class SDTextEncoder (line 130) | class SDTextEncoder(torch.nn.Module):
method __init__ (line 131) | def __init__(self, embed_dim=768, vocab_size=49408, max_position_embed...
method attention_mask (line 149) | def attention_mask(self, length):
method forward (line 155) | def forward(self, input_ids, clip_skip=1):
method state_dict_converter (line 166) | def state_dict_converter():
class SDTextEncoderStateDictConverter (line 170) | class SDTextEncoderStateDictConverter:
method __init__ (line 171) | def __init__(self):
method from_diffusers (line 174) | def from_diffusers(self, state_dict):
method from_civitai (line 206) | def from_civitai(self, state_dict):
FILE: diffsynth/models/siglip2_image_encoder.py
class Siglip2ImageEncoder (line 8) | class Siglip2ImageEncoder(SiglipVisionTransformer):
method __init__ (line 9) | def __init__(self):
method forward (line 52) | def forward(self, image, torch_dtype=torch.bfloat16, device=get_device...
class Siglip2ImageEncoder428M (line 75) | class Siglip2ImageEncoder428M(Siglip2VisionModel):
method __init__ (line 76) | def __init__(self):
method forward (line 126) | def forward(self, image, torch_dtype=torch.bfloat16, device="cuda"):
FILE: diffsynth/models/step1x_connector.py
function attention (line 12) | def attention(q, k, v, attn_mask, mode="torch"):
class MLP (line 22) | class MLP(nn.Module):
method __init__ (line 25) | def __init__(
method forward (line 60) | def forward(self, x):
class TextProjection (line 70) | class TextProjection(nn.Module):
method __init__ (line 77) | def __init__(self, in_channels, hidden_size, act_layer, dtype=None, de...
method forward (line 94) | def forward(self, caption):
class TimestepEmbedder (line 101) | class TimestepEmbedder(nn.Module):
method __init__ (line 106) | def __init__(
method timestep_embedding (line 134) | def timestep_embedding(t, dim, max_period=10000):
method forward (line 162) | def forward(self, t):
function apply_gate (line 170) | def apply_gate(x, gate=None, tanh=False):
class RMSNorm (line 189) | class RMSNorm(nn.Module):
method __init__ (line 190) | def __init__(
method _norm (line 216) | def _norm(self, x):
method forward (line 229) | def forward(self, x):
function get_norm_layer (line 246) | def get_norm_layer(norm_layer):
function get_activation_layer (line 264) | def get_activation_layer(act_type):
class IndividualTokenRefinerBlock (line 284) | class IndividualTokenRefinerBlock(torch.nn.Module):
method __init__ (line 285) | def __init__(
method forward (line 358) | def forward(
class CrossAttnBlock (line 390) | class CrossAttnBlock(torch.nn.Module):
method __init__ (line 391) | def __init__(
method forward (line 449) | def forward(
class IndividualTokenRefiner (line 478) | class IndividualTokenRefiner(torch.nn.Module):
method __init__ (line 479) | def __init__(
method forward (line 517) | def forward(
class SingleTokenRefiner (line 547) | class SingleTokenRefiner(torch.nn.Module):
method __init__ (line 551) | def __init__(
method forward (line 604) | def forward(
class Qwen2Connector (line 633) | class Qwen2Connector(torch.nn.Module):
method __init__ (line 634) | def __init__(
method forward (line 655) | def forward(self, x,t,mask):
FILE: diffsynth/models/step1x_text_encoder.py
class Step1xEditEmbedder (line 7) | class Step1xEditEmbedder(torch.nn.Module):
method __init__ (line 8) | def __init__(self, model: QwenImageTextEncoder, processor, max_length=...
method model_forward (line 27) | def model_forward(
method forward (line 74) | def forward(self, caption, ref_images):
FILE: diffsynth/models/wan_video_animate_adapter.py
function attention (line 26) | def attention(
class CausalConv1d (line 50) | class CausalConv1d(nn.Module):
method __init__ (line 52) | def __init__(self, chan_in, chan_out, kernel_size=3, stride=1, dilatio...
method forward (line 61) | def forward(self, x):
class FaceEncoder (line 67) | class FaceEncoder(nn.Module):
method __init__ (line 68) | def __init__(self, in_dim: int, hidden_dim: int, num_heads=int, dtype=...
method forward (line 88) | def forward(self, x):
class RMSNorm (line 118) | class RMSNorm(nn.Module):
method __init__ (line 119) | def __init__(
method _norm (line 145) | def _norm(self, x):
method forward (line 158) | def forward(self, x):
function get_norm_layer (line 175) | def get_norm_layer(norm_layer):
class FaceAdapter (line 193) | class FaceAdapter(nn.Module):
method __init__ (line 194) | def __init__(
method forward (line 222) | def forward(
class FaceBlock (line 235) | class FaceBlock(nn.Module):
method __init__ (line 236) | def __init__(
method forward (line 272) | def forward(
function custom_qr (line 314) | def custom_qr(input_tensor):
function fused_leaky_relu (line 321) | def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5):
function upfirdn2d_native (line 325) | def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, ...
function upfirdn2d (line 345) | def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
function make_kernel (line 349) | def make_kernel(k):
class FusedLeakyReLU (line 357) | class FusedLeakyReLU(nn.Module):
method __init__ (line 358) | def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5):
method forward (line 364) | def forward(self, input):
class Blur (line 369) | class Blur(nn.Module):
method __init__ (line 370) | def __init__(self, kernel, pad, upsample_factor=1):
method forward (line 382) | def forward(self, input):
class ScaledLeakyReLU (line 386) | class ScaledLeakyReLU(nn.Module):
method __init__ (line 387) | def __init__(self, negative_slope=0.2):
method forward (line 392) | def forward(self, input):
class EqualConv2d (line 396) | class EqualConv2d(nn.Module):
method __init__ (line 397) | def __init__(self, in_channel, out_channel, kernel_size, stride=1, pad...
method forward (line 411) | def forward(self, input):
method __repr__ (line 415) | def __repr__(self):
class EqualLinear (line 422) | class EqualLinear(nn.Module):
method __init__ (line 423) | def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1, ...
method forward (line 438) | def forward(self, input):
method __repr__ (line 448) | def __repr__(self):
class ConvLayer (line 452) | class ConvLayer(nn.Sequential):
method __init__ (line 453) | def __init__(
class ResBlock (line 492) | class ResBlock(nn.Module):
method __init__ (line 493) | def __init__(self, in_channel, out_channel, blur_kernel=[1, 3, 3, 1]):
method forward (line 501) | def forward(self, input):
class EncoderApp (line 511) | class EncoderApp(nn.Module):
method __init__ (line 512) | def __init__(self, size, w_dim=512):
method forward (line 541) | def forward(self, x):
class Encoder (line 552) | class Encoder(nn.Module):
method __init__ (line 553) | def __init__(self, size, dim=512, dim_motion=20):
method enc_app (line 567) | def enc_app(self, x):
method enc_motion (line 571) | def enc_motion(self, x):
class Direction (line 577) | class Direction(nn.Module):
method __init__ (line 578) | def __init__(self, motion_dim):
method forward (line 582) | def forward(self, input):
class Synthesis (line 595) | class Synthesis(nn.Module):
method __init__ (line 596) | def __init__(self, motion_dim):
class Generator (line 601) | class Generator(nn.Module):
method __init__ (line 602) | def __init__(self, size, style_dim=512, motion_dim=20):
method get_motion (line 608) | def get_motion(self, img):
class WanAnimateAdapter (line 615) | class WanAnimateAdapter(torch.nn.Module):
method __init__ (line 616) | def __init__(self):
method after_patch_embedding (line 623) | def after_patch_embedding(self, x: List[torch.Tensor], pose_latents, f...
method after_transformer_block (line 645) | def after_transformer_block(self, block_idx, x, motion_vec, motion_mas...
FILE: diffsynth/models/wan_video_camera_controller.py
class SimpleAdapter (line 8) | class SimpleAdapter(nn.Module):
method __init__ (line 9) | def __init__(self, in_dim, out_dim, kernel_size, stride, num_residual_...
method forward (line 24) | def forward(self, x):
method process_camera_coordinates (line 46) | def process_camera_coordinates(
class ResidualBlock (line 63) | class ResidualBlock(nn.Module):
method __init__ (line 64) | def __init__(self, dim):
method forward (line 70) | def forward(self, x):
class Camera (line 77) | class Camera(object):
method __init__ (line 80) | def __init__(self, entry):
function get_relative_pose (line 92) | def get_relative_pose(cam_params):
function custom_meshgrid (line 109) | def custom_meshgrid(*args):
function ray_condition (line 114) | def ray_condition(K, c2w, H, W, device):
function process_pose_file (line 150) | def process_pose_file(cam_params, width=672, height=384, original_pose_w...
function generate_camera_coordinates (line 184) | def generate_camera_coordinates(
FILE: diffsynth/models/wan_video_dit.py
function flash_attention (line 30) | def flash_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, n...
function modulate (line 66) | def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
function sinusoidal_embedding_1d (line 70) | def sinusoidal_embedding_1d(dim, position):
function precompute_freqs_cis_3d (line 77) | def precompute_freqs_cis_3d(dim: int, end: int = 1024, theta: float = 10...
function precompute_freqs_cis (line 85) | def precompute_freqs_cis(dim: int, end: int = 1024, theta: float = 10000...
function rope_apply (line 94) | def rope_apply(x, freqs, num_heads):
function set_to_torch_norm (line 103) | def set_to_torch_norm(models):
class RMSNorm (line 110) | class RMSNorm(nn.Module):
method __init__ (line 111) | def __init__(self, dim, eps=1e-5):
method norm (line 118) | def norm(self, x):
method forward (line 121) | def forward(self, x):
class AttentionModule (line 129) | class AttentionModule(nn.Module):
method __init__ (line 130) | def __init__(self, num_heads):
method forward (line 134) | def forward(self, q, k, v):
class SelfAttention (line 139) | class SelfAttention(nn.Module):
method __init__ (line 140) | def __init__(self, dim: int, num_heads: int, eps: float = 1e-6):
method forward (line 155) | def forward(self, x, freqs):
class CrossAttention (line 165) | class CrossAttention(nn.Module):
method __init__ (line 166) | def __init__(self, dim: int, num_heads: int, eps: float = 1e-6, has_im...
method forward (line 186) | def forward(self, x: torch.Tensor, y: torch.Tensor):
class GateModule (line 204) | class GateModule(nn.Module):
method __init__ (line 205) | def __init__(self,):
method forward (line 208) | def forward(self, x, gate, residual):
class DiTBlock (line 211) | class DiTBlock(nn.Module):
method __init__ (line 212) | def __init__(self, has_image_input: bool, dim: int, num_heads: int, ff...
method forward (line 229) | def forward(self, x, context, t_mod, freqs):
class MLP (line 248) | class MLP(torch.nn.Module):
method __init__ (line 249) | def __init__(self, in_dim, out_dim, has_pos_emb=False):
method forward (line 262) | def forward(self, x):
class Head (line 268) | class Head(nn.Module):
method __init__ (line 269) | def __init__(self, dim: int, out_dim: int, patch_size: Tuple[int, int,...
method forward (line 277) | def forward(self, x, t_mod):
function wantodance_torch_dfs (line 287) | def wantodance_torch_dfs(model: nn.Module, parent_name='root'):
class WanToDanceInjector (line 303) | class WanToDanceInjector(nn.Module):
method __init__ (line 304) | def __init__(self, all_modules, all_modules_names, dim=2048, num_heads...
class WanModel (line 338) | class WanModel(torch.nn.Module):
method __init__ (line 339) | def __init__(
method prepare_wantodance (line 421) | def prepare_wantodance(
method wantodance_after_transformer_block (line 473) | def wantodance_after_transformer_block(self, block_idx, hidden_states):
method patchify (line 489) | def patchify(self, x: torch.Tensor, control_camera_latents_input: Opti...
method unpatchify (line 500) | def unpatchify(self, x: torch.Tensor, grid_size: torch.Tensor):
method forward (line 507) | def forward(self,
FILE: diffsynth/models/wan_video_dit_s2v.py
function torch_dfs (line 10) | def torch_dfs(model: nn.Module, parent_name='root'):
function rope_precompute (line 27) | def rope_precompute(x, grid_sizes, freqs, start=None):
class CausalConv1d (line 86) | class CausalConv1d(nn.Module):
method __init__ (line 88) | def __init__(self, chan_in, chan_out, kernel_size=3, stride=1, dilatio...
method forward (line 97) | def forward(self, x):
class MotionEncoder_tc (line 102) | class MotionEncoder_tc(nn.Module):
method __init__ (line 104) | def __init__(self, in_dim: int, hidden_dim: int, num_heads=int, need_g...
method forward (line 126) | def forward(self, x):
class FramePackMotioner (line 172) | class FramePackMotioner(nn.Module):
method __init__ (line 174) | def __init__(self, inner_dim=1024, num_heads=16, zip_frame_buckets=[1,...
method forward (line 186) | def forward(self, motion_latents, add_last_motion=2):
class AdaLayerNorm (line 260) | class AdaLayerNorm(nn.Module):
method __init__ (line 262) | def __init__(
method forward (line 273) | def forward(self, x, temb):
class AudioInjector_WAN (line 282) | class AudioInjector_WAN(nn.Module):
method __init__ (line 284) | def __init__(
class CausalAudioEncoder (line 322) | class CausalAudioEncoder(nn.Module):
method __init__ (line 324) | def __init__(self, dim=5120, num_layers=25, out_dim=2048, num_token=4,...
method forward (line 332) | def forward(self, features):
class WanS2VDiTBlock (line 342) | class WanS2VDiTBlock(DiTBlock):
method forward (line 344) | def forward(self, x, context, t_mod, seq_len_x, freqs):
class WanS2VModel (line 360) | class WanS2VModel(torch.nn.Module):
method __init__ (line 362) | def __init__(
method patchify (line 425) | def patchify(self, x: torch.Tensor):
method unpatchify (line 430) | def unpatchify(self, x: torch.Tensor, grid_size: torch.Tensor):
method process_motion_frame_pack (line 442) | def process_motion_frame_pack(self, motion_latents, drop_motion_frames...
method inject_motion (line 449) | def inject_motion(self, x, rope_embs, mask_input, motion_latents, drop...
method after_transformer_block (line 460) | def after_transformer_block(self, block_idx, hidden_states, audio_emb_...
method cal_audio_emb (line 485) | def cal_audio_emb(self, audio_input, motion_frames=[73, 19]):
method get_grid_sizes (line 492) | def get_grid_sizes(self, grid_size_x, grid_size_ref):
method forward (line 504) | def forward(
FILE: diffsynth/models/wan_video_image_encoder.py
class SelfAttention (line 14) | class SelfAttention(nn.Module):
method __init__ (line 16) | def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
method forward (line 31) | def forward(self, x, mask):
method __init__ (line 236) | def __init__(self,
method forward (line 255) | def forward(self, x):
class AttentionBlock (line 53) | class AttentionBlock(nn.Module):
method __init__ (line 55) | def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
method forward (line 70) | def forward(self, x, mask):
method __init__ (line 291) | def __init__(self,
method forward (line 323) | def forward(self, x):
class XLMRoberta (line 80) | class XLMRoberta(nn.Module):
method __init__ (line 85) | def __init__(self,
method forward (line 122) | def forward(self, ids):
function xlm_roberta_large (line 150) | def xlm_roberta_large(pretrained=False,
function pos_interpolate (line 203) | def pos_interpolate(pos, seq_len):
class QuickGELU (line 222) | class QuickGELU(nn.Module):
method forward (line 224) | def forward(self, x):
class LayerNorm (line 228) | class LayerNorm(nn.LayerNorm):
method forward (line 230) | def forward(self, x):
class SelfAttention (line 234) | class SelfAttention(nn.Module):
method __init__ (line 16) | def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
method forward (line 31) | def forward(self, x, mask):
method __init__ (line 236) | def __init__(self,
method forward (line 255) | def forward(self, x):
class SwiGLU (line 271) | class SwiGLU(nn.Module):
method __init__ (line 273) | def __init__(self, dim, mid_dim):
method forward (line 283) | def forward(self, x):
class AttentionBlock (line 289) | class AttentionBlock(nn.Module):
method __init__ (line 55) | def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
method forward (line 70) | def forward(self, x, mask):
method __init__ (line 291) | def __init__(self,
method forward (line 323) | def forward(self, x):
class AttentionPool (line 333) | class AttentionPool(nn.Module):
method __init__ (line 335) | def __init__(self,
method forward (line 363) | def forward(self, x):
class VisionTransformer (line 386) | class VisionTransformer(nn.Module):
method __init__ (line 388) | def __init__(self,
method forward (line 456) | def forward(self, x, interpolation=False, use_31_block=False):
class CLIP (line 481) | class CLIP(nn.Module):
method __init__ (line 483) | def __init__(self,
method forward (line 571) | def forward(self, imgs, txt_ids):
method init_weights (line 582) | def init_weights(self):
method param_groups (line 601) | def param_groups(self):
class XLMRobertaWithHead (line 617) | class XLMRobertaWithHead(XLMRoberta):
method __init__ (line 619) | def __init__(self, **kwargs):
method forward (line 629) | def forward(self, ids):
class XLMRobertaCLIP (line 642) | class XLMRobertaCLIP(nn.Module):
method __init__ (line 644) | def __init__(self,
method forward (line 710) | def forward(self, imgs, txt_ids):
method param_groups (line 722) | def param_groups(self):
function _clip (line 738) | def _clip(pretrained=False,
function clip_xlm_roberta_vit_h_14 (line 822) | def clip_xlm_roberta_vit_h_14(
class WanImageEncoder (line 852) | class WanImageEncoder(torch.nn.Module):
method __init__ (line 854) | def __init__(self):
method encode_image (line 864) | def encode_image(self, videos):
FILE: diffsynth/models/wan_video_mot.py
class MotSelfAttention (line 7) | class MotSelfAttention(SelfAttention):
method __init__ (line 8) | def __init__(self, dim: int, num_heads: int, eps: float = 1e-6):
method forward (line 10) | def forward(self, x, freqs, is_before_attn=False):
class MotWanAttentionBlock (line 22) | class MotWanAttentionBlock(DiTBlock):
method __init__ (line 23) | def __init__(self, has_image_input, dim, num_heads, ffn_dim, eps=1e-6,...
method forward (line 30) | def forward(self, wan_block, x, context, t_mod, freqs, x_mot, context_...
class MotWanModel (line 94) | class MotWanModel(torch.nn.Module):
method __init__ (line 95) | def __init__(
method patchify (line 142) | def patchify(self, x: torch.Tensor):
method compute_freqs_mot (line 146) | def compute_freqs_mot(self, f, h, w, end: int = 1024, theta: float = 1...
method forward (line 166) | def forward(self, wan_block, x, context, t_mod, freqs, x_mot, context_...
FILE: diffsynth/models/wan_video_motion_controller.py
class WanMotionControllerModel (line 7) | class WanMotionControllerModel(torch.nn.Module):
method __init__ (line 8) | def __init__(self, freq_dim=256, dim=1536):
method forward (line 19) | def forward(self, motion_bucket_id):
method init (line 24) | def init(self):
FILE: diffsynth/models/wan_video_text_encoder.py
function fp16_clamp (line 11) | def fp16_clamp(x):
class GELU (line 18) | class GELU(nn.Module):
method forward (line 20) | def forward(self, x):
class T5LayerNorm (line 25) | class T5LayerNorm(nn.Module):
method __init__ (line 27) | def __init__(self, dim, eps=1e-6):
method forward (line 33) | def forward(self, x):
class T5Attention (line 41) | class T5Attention(nn.Module):
method __init__ (line 43) | def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
method forward (line 58) | def forward(self, x, context=None, mask=None, pos_bias=None):
class T5FeedForward (line 95) | class T5FeedForward(nn.Module):
method __init__ (line 97) | def __init__(self, dim, dim_ffn, dropout=0.1):
method forward (line 108) | def forward(self, x):
class T5SelfAttention (line 116) | class T5SelfAttention(nn.Module):
method __init__ (line 118) | def __init__(self,
method forward (line 142) | def forward(self, x, mask=None, pos_bias=None):
class T5RelativeEmbedding (line 150) | class T5RelativeEmbedding(nn.Module):
method __init__ (line 152) | def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
method forward (line 162) | def forward(self, lq, lk):
method _relative_position_bucket (line 174) | def _relative_position_bucket(self, rel_pos):
function init_weights (line 195) | def init_weights(m):
class WanTextEncoder (line 212) | class WanTextEncoder(torch.nn.Module):
method __init__ (line 214) | def __init__(self,
method forward (line 248) | def forward(self, ids, mask=None):
function basic_clean (line 260) | def basic_clean(text):
function whitespace_clean (line 266) | def whitespace_clean(text):
function canonicalize (line 272) | def canonicalize(text, keep_punctuation_exact_string=None):
class HuggingfaceTokenizer (line 285) | class HuggingfaceTokenizer:
method __init__ (line 287) | def __init__(self, name, seq_len=None, clean=None, **kwargs):
method __call__ (line 297) | def __call__(self, sequence, **kwargs):
method _clean (line 323) | def _clean(self, text):
FILE: diffsynth/models/wan_video_vace.py
class VaceWanAttentionBlock (line 5) | class VaceWanAttentionBlock(DiTBlock):
method __init__ (line 6) | def __init__(self, has_image_input, dim, num_heads, ffn_dim, eps=1e-6,...
method forward (line 13) | def forward(self, c, x, context, t_mod, freqs):
class VaceWanModel (line 27) | class VaceWanModel(torch.nn.Module):
method __init__ (line 28) | def __init__(
method forward (line 53) | def forward(
FILE: diffsynth/models/wan_video_vae.py
function check_is_instance (line 11) | def check_is_instance(model, module_class):
function block_causal_mask (line 19) | def block_causal_mask(x, block_size):
class CausalConv3d (line 33) | class CausalConv3d(nn.Conv3d):
method __init__ (line 38) | def __init__(self, *args, **kwargs):
method forward (line 44) | def forward(self, x, cache_x=None):
class RMS_norm (line 55) | class RMS_norm(nn.Module):
method __init__ (line 57) | def __init__(self, dim, channel_first=True, images=True, bias=False):
method forward (line 67) | def forward(self, x):
class Upsample (line 73) | class Upsample(nn.Upsample):
method forward (line 75) | def forward(self, x):
class Resample (line 82) | class Resample(nn.Module):
method __init__ (line 84) | def __init__(self, dim, mode):
method forward (line 120) | def forward(self, x, feat_cache=None, feat_idx=[0]):
method init_weight (line 176) | def init_weight(self, conv):
method init_weight2 (line 187) | def init_weight2(self, conv):
function patchify (line 199) | def patchify(x, patch_size):
function unpatchify (line 214) | def unpatchify(x, patch_size):
class Resample38 (line 227) | class Resample38(Resample):
method __init__ (line 229) | def __init__(self, dim, mode):
class ResidualBlock (line 267) | class ResidualBlock(nn.Module):
method __init__ (line 269) | def __init__(self, in_dim, out_dim, dropout=0.0):
method forward (line 283) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class AttentionBlock (line 304) | class AttentionBlock(nn.Module):
method __init__ (line 309) | def __init__(self, dim):
method forward (line 321) | def forward(self, x):
class AvgDown3D (line 345) | class AvgDown3D(nn.Module):
method __init__ (line 346) | def __init__(
method forward (line 363) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class DupUp3D (line 398) | class DupUp3D(nn.Module):
method __init__ (line 399) | def __init__(
method forward (line 417) | def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
class Down_ResidualBlock (line 442) | class Down_ResidualBlock(nn.Module):
method __init__ (line 443) | def __init__(
method forward (line 469) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class Up_ResidualBlock (line 477) | class Up_ResidualBlock(nn.Module):
method __init__ (line 478) | def __init__(
method forward (line 506) | def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
class Encoder3d (line 517) | class Encoder3d(nn.Module):
method __init__ (line 519) | def __init__(self,
method forward (line 569) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class Encoder3d_38 (line 620) | class Encoder3d_38(nn.Module):
method __init__ (line 622) | def __init__(self,
method forward (line 679) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class Decoder3d (line 736) | class Decoder3d(nn.Module):
method __init__ (line 738) | def __init__(self,
method forward (line 789) | def forward(self, x, feat_cache=None, feat_idx=[0]):
class Decoder3d_38 (line 842) | class Decoder3d_38(nn.Module):
method __init__ (line 844) | def __init__(self,
method forward (line 889) | def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
function count_conv3d (line 943) | def count_conv3d(model):
class VideoVAE_ (line 951) | class VideoVAE_(nn.Module):
method __init__ (line 953) | def __init__(self,
method forward (line 978) | def forward(self, x):
method encode (line 984) | def encode(self, x, scale):
method decode (line 1011) | def decode(self, z, scale):
method reparameterize (line 1036) | def reparameterize(self, mu, log_var):
method sample (line 1041) | def sample(self, imgs, deterministic=False):
method clear_cache (line 1048) | def clear_cache(self):
class WanVideoVAE (line 1058) | class WanVideoVAE(nn.Module):
method __init__ (line 1060) | def __init__(self, z_dim=16):
method build_1d_mask (line 1081) | def build_1d_mask(self, length, left_bound, right_bound, border_width):
method build_mask (line 1090) | def build_mask(self, data, is_bound, border_width):
method tiled_decode (line 1103) | def tiled_decode(self, hidden_states, device, tile_size, tile_stride):
method tiled_encode (line 1155) | def tiled_encode(self, video, device, tile_size, tile_stride):
method single_encode (line 1206) | def single_encode(self, video, device):
method single_decode (line 1212) | def single_decode(self, hidden_state, device):
method encode (line 1218) | def encode(self, videos, device, tiled=False, tile_size=(34, 34), tile...
method decode (line 1235) | def decode(self, hidden_states, device, tiled=False, tile_size=(34, 34...
method encode_framewise (line 1250) | def encode_framewise(self, videos, device):
method decode_framewise (line 1258) | def decode_framewise(self, hidden_states, device):
method state_dict_converter (line 1267) | def state_dict_converter():
class WanVideoVAEStateDictConverter (line 1271) | class WanVideoVAEStateDictConverter:
method __init__ (line 1273) | def __init__(self):
method from_civitai (line 1276) | def from_civitai(self, state_dict):
class VideoVAE38_ (line 1285) | class VideoVAE38_(VideoVAE_):
method __init__ (line 1287) | def __init__(self,
method encode (line 1314) | def encode(self, x, scale):
method decode (line 1342) | def decode(self, z, scale):
class WanVideoVAE38 (line 1370) | class WanVideoVAE38(WanVideoVAE):
method __init__ (line 1372) | def __init__(self, z_dim=48, dim=160):
FILE: diffsynth/models/wantodance.py
function exists (line 15) | def exists(val):
function broadcat (line 19) | def broadcat(tensors, dim=-1):
function rotate_half (line 43) | def rotate_half(x):
function apply_rotary_emb (line 50) | def apply_rotary_emb(freqs, t, start_index=0):
function apply_learned_rotations (line 69) | def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
class WanToDanceRotaryEmbedding (line 81) | class WanToDanceRotaryEmbedding(nn.Module):
method __init__ (line 82) | def __init__(
method rotate_queries_or_keys (line 113) | def rotate_queries_or_keys(self, t, seq_dim=-2):
method forward (line 121) | def forward(self, t, cache_key=None):
class WanToDanceMusicEncoderLayer (line 140) | class WanToDanceMusicEncoderLayer(nn.Module):
method __init__ (line 141) | def __init__(
method _sa_block (line 175) | def _sa_block(
method _ff_block (line 190) | def _ff_block(self, x: Tensor) -> Tensor:
method forward (line 194) | def forward(
FILE: diffsynth/models/wav2vec.py
function get_sample_indices (line 7) | def get_sample_indices(original_fps, total_frames, target_fps, num_sampl...
function linear_interpolation (line 30) | def linear_interpolation(features, input_fps, output_fps, output_len=None):
class WanS2VAudioEncoder (line 45) | class WanS2VAudioEncoder(torch.nn.Module):
method __init__ (line 47) | def __init__(self):
method extract_audio_feat (line 102) | def extract_audio_feat(self, input_audio, sample_rate, processor, retu...
method get_audio_embed_bucket (line 114) | def get_audio_embed_bucket(self, audio_embed, stride=2, batch_frames=1...
method get_audio_embed_bucket_fps (line 147) | def get_audio_embed_bucket_fps(self, audio_embed, fps=16, batch_frames...
method get_audio_feats_per_inference (line 186) | def get_audio_feats_per_inference(self, input_audio, sample_rate, proc...
FILE: diffsynth/models/z_image_controlnet.py
class ZImageControlTransformerBlock (line 8) | class ZImageControlTransformerBlock(ZImageTransformerBlock):
method __init__ (line 9) | def __init__(
method forward (line 26) | def forward(self, c, x, **kwargs):
class ZImageControlNet (line 41) | class ZImageControlNet(torch.nn.Module):
method __init__ (line 42) | def __init__(
method forward_layers (line 55) | def forward_layers(
method forward_refiner (line 90) | def forward_refiner(
FILE: diffsynth/models/z_image_dit.py
class TimestepEmbedder (line 20) | class TimestepEmbedder(nn.Module):
method __init__ (line 21) | def __init__(self, out_size, mid_size=None, frequency_embedding_size=2...
method timestep_embedding (line 42) | def timestep_embedding(t, dim, max_period=10000):
method forward (line 54) | def forward(self, t):
class FeedForward (line 60) | class FeedForward(nn.Module):
method __init__ (line 61) | def __init__(self, dim: int, hidden_dim: int):
method _forward_silu_gating (line 67) | def _forward_silu_gating(self, x1, x3):
method forward (line 70) | def forward(self, x):
class Attention (line 74) | class Attention(torch.nn.Module):
method __init__ (line 76) | def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=Fal...
method apply_rotary_emb (line 92) | def apply_rotary_emb(self, x_in: torch.Tensor, freqs_cis: torch.Tensor...
method forward (line 99) | def forward(self, hidden_states, freqs_cis, attention_mask):
function select_per_token (line 142) | def select_per_token(
class ZImageTransformerBlock (line 156) | class ZImageTransformerBlock(nn.Module):
method __init__ (line 157) | def __init__(
method forward (line 194) | def forward(
class FinalLayer (line 251) | class FinalLayer(nn.Module):
method __init__ (line 252) | def __init__(self, hidden_size, out_channels):
method forward (line 262) | def forward(self, x, c=None, noise_mask=None, c_noisy=None, c_clean=No...
class RopeEmbedder (line 281) | class RopeEmbedder:
method __init__ (line 282) | def __init__(
method precompute_freqs_cis (line 295) | def precompute_freqs_cis(dim: List[int], end: List[int], theta: float ...
method __call__ (line 307) | def __call__(self, ids: torch.Tensor):
class ZImageDiT (line 326) | class ZImageDiT(nn.Module):
method __init__ (line 330) | def __init__(
method unpatchify (line 450) | def unpatchify(
method create_coordinate_grid (line 503) | def create_coordinate_grid(size, start=None, device=None):
method patchify_and_embed (line 511) | def patchify_and_embed(
method patchify_controlnet (line 617) | def patchify_controlnet(
method _prepare_sequence (line 683) | def _prepare_sequence(
method _build_unified_sequence (line 725) | def _build_unified_sequence(
method _pad_with_ids (line 806) | def _pad_with_ids(
method _patchify_image (line 843) | def _patchify_image(self, image: torch.Tensor, patch_size: int, f_patc...
method patchify_and_embed_omni (line 852) | def patchify_and_embed_omni(
method forward (line 1004) | def forward(
FILE: diffsynth/models/z_image_image2lora.py
class LoRATrainerBlock (line 5) | class LoRATrainerBlock(torch.nn.Module):
method __init__ (line 6) | def __init__(self, lora_patterns, in_dim=1536+4096, compress_dim=128, ...
method forward (line 20) | def forward(self, x, residual=None):
class ZImageImage2LoRAComponent (line 31) | class ZImageImage2LoRAComponent(torch.nn.Module):
method __init__ (line 32) | def __init__(self, lora_patterns, prefix, num_blocks=60, use_residual=...
method forward (line 44) | def forward(self, x, residual=None):
class ZImageImage2LoRAModel (line 56) | class ZImageImage2LoRAModel(torch.nn.Module):
method __init__ (line 57) | def __init__(self, use_residual=False, compress_dim=64, rank=4, residu...
method forward (line 96) | def forward(self, x, residual=None):
method initialize_weights (line 103) | def initialize_weights(self):
class ImageEmb2LoRAWeightCompressed (line 115) | class ImageEmb2LoRAWeightCompressed(torch.nn.Module):
method __init__ (line 116) | def __init__(self, in_dim, out_dim, emb_dim, rank):
method forward (line 123) | def forward(self, x):
class ZImageImage2LoRAModelCompressed (line 130) | class ZImageImage2LoRAModelCompressed(torch.nn.Module):
method __init__ (line 131) | def __init__(self, emb_dim=1536+4096, rank=32):
method forward (line 169) | def forward(self, x, residual=None):
method initialize_weights (line 179) | def initialize_weights(self):
FILE: diffsynth/models/z_image_text_encoder.py
class ZImageTextEncoder (line 5) | class ZImageTextEncoder(torch.nn.Module):
method __init__ (line 6) | def __init__(self, model_size="4B"):
method forward (line 103) | def forward(self, *args, **kwargs):
FILE: diffsynth/pipelines/anima_image.py
class AnimaImagePipeline (line 21) | class AnimaImagePipeline(BasePipeline):
method __init__ (line 23) | def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16):
method from_pretrained (line 45) | def from_pretrained(
method __call__ (line 73) | def __call__(
class AnimaUnit_ShapeChecker (line 135) | class AnimaUnit_ShapeChecker(PipelineUnit):
method __init__ (line 136) | def __init__(self):
method process (line 142) | def process(self, pipe: AnimaImagePipeline, height, width):
class AnimaUnit_NoiseInitializer (line 148) | class AnimaUnit_NoiseInitializer(PipelineUnit):
method __init__ (line 149) | def __init__(self):
method process (line 155) | def process(self, pipe: AnimaImagePipeline, height, width, seed, rand_...
class AnimaUnit_InputImageEmbedder (line 161) | class AnimaUnit_InputImageEmbedder(PipelineUnit):
method __init__ (line 162) | def __init__(self):
method process (line 169) | def process(self, pipe: AnimaImagePipeline, input_image, noise):
class AnimaUnit_PromptEmbedder (line 189) | class AnimaUnit_PromptEmbedder(PipelineUnit):
method __init__ (line 190) | def __init__(self):
method encode_prompt (line 199) | def encode_prompt(
method process (line 236) | def process(self, pipe: AnimaImagePipeline, prompt):
function model_fn_anima (line 242) | def model_fn_anima(
FILE: diffsynth/pipelines/flux2_image.py
class Flux2ImagePipeline (line 21) | class Flux2ImagePipeline(BasePipeline):
method __init__ (line 23) | def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16):
method from_pretrained (line 48) | def from_pretrained(
method __call__ (line 74) | def __call__(
class Flux2Unit_ShapeChecker (line 141) | class Flux2Unit_ShapeChecker(PipelineUnit):
method __init__ (line 142) | def __init__(self):
method process (line 148) | def process(self, pipe: Flux2ImagePipeline, height, width):
class Flux2Unit_PromptEmbedder (line 153) | class Flux2Unit_PromptEmbedder(PipelineUnit):
method __init__ (line 154) | def __init__(self):
method format_text_input (line 164) | def format_text_input(self, prompts: List[str], system_message: str = ...
method get_mistral_3_small_prompt_embeds (line 181) | def get_mistral_3_small_prompt_embeds(
method prepare_text_ids (line 235) | def prepare_text_ids(
method encode_prompt (line 254) | def encode_prompt(
method process (line 288) | def process(self, pipe: Flux2ImagePipeline, prompt):
class Flux2Unit_Qwen3PromptEmbedder (line 301) | class Flux2Unit_Qwen3PromptEmbedder(PipelineUnit):
method __init__ (line 302) | def __init__(self):
method get_qwen3_prompt_embeds (line 312) | def get_qwen3_prompt_embeds(
method prepare_text_ids (line 367) | def prepare_text_ids(
method encode_prompt (line 386) | def encode_prompt(
method process (line 417) | def process(self, pipe: Flux2ImagePipeline, prompt):
class Flux2Unit_NoiseInitializer (line 430) | class Flux2Unit_NoiseInitializer(PipelineUnit):
method __init__ (line 431) | def __init__(self):
method process (line 437) | def process(self, pipe: Flux2ImagePipeline, height, width, seed, rand_...
class Flux2Unit_InputImageEmbedder (line 446) | class Flux2Unit_InputImageEmbedder(PipelineUnit):
method __init__ (line 447) | def __init__(self):
method process (line 454) | def process(self, pipe: Flux2ImagePipeline, input_image, noise):
class Flux2Unit_EditImageEmbedder (line 468) | class Flux2Unit_EditImageEmbedder(PipelineUnit):
method __init__ (line 469) | def __init__(self):
method calculate_dimensions (line 476) | def calculate_dimensions(self, target_area, ratio):
method crop_and_resize (line 484) | def crop_and_resize(self, image, target_height, target_width):
method edit_image_auto_resize (line 495) | def edit_image_auto_resize(self, edit_image):
method process_image_ids (line 499) | def process_image_ids(self, image_latents, scale=10):
method process (line 516) | def process(self, pipe: Flux2ImagePipeline, edit_image, edit_image_aut...
class Flux2Unit_ImageIDs (line 537) | class Flux2Unit_ImageIDs(PipelineUnit):
method __init__ (line 538) | def __init__(self):
method prepare_latent_ids (line 544) | def prepare_latent_ids(self, height, width):
method process (line 558) | def process(self, pipe: Flux2ImagePipeline, height, width):
function model_fn_flux2 (line 563) | def model_fn_flux2(
FILE: diffsynth/pipelines/flux_image.py
class MultiControlNet (line 23) | class MultiControlNet(torch.nn.Module):
method __init__ (line 24) | def __init__(self, models: list[torch.nn.Module]):
method process_single_controlnet (line 30) | def process_single_controlnet(self, controlnet_input: ControlNetInput,...
method forward (line 41) | def forward(self, conditionings: list[torch.Tensor], controlnet_inputs...
class FluxImagePipeline (line 57) | class FluxImagePipeline(BasePipeline):
method __init__ (line 59) | def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16):
method enable_lora_merger (line 108) | def enable_lora_merger(self):
method from_pretrained (line 119) | def from_pretrained(
method __call__ (line 180) | def __call__(
class FluxImageUnit_ShapeChecker (line 298) | class FluxImageUnit_ShapeChecker(PipelineUnit):
method __init__ (line 299) | def __init__(self):
method process (line 302) | def process(self, pipe: FluxImagePipeline, height, width):
class FluxImageUnit_NoiseInitializer (line 308) | class FluxImageUnit_NoiseInitializer(PipelineUnit):
method __init__ (line 309) | def __init__(self):
method process (line 312) | def process(self, pipe: FluxImagePipeline, height, width, seed, rand_d...
class FluxImageUnit_InputImageEmbedder (line 318) | class FluxImageUnit_InputImageEmbedder(PipelineUnit):
method __init__ (line 319) | def __init__(self):
method process (line 326) | def process(self, pipe: FluxImagePipeline, input_image, noise, tiled, ...
class FluxImageUnit_PromptEmbedder (line 340) | class FluxImageUnit_PromptEmbedder(PipelineUnit):
method __init__ (line 341) | def __init__(self):
method encode_prompt_using_clip (line 351) | def encode_prompt_using_clip(self, prompt, text_encoder, tokenizer, ma...
method encode_prompt_using_t5 (line 362) | def encode_prompt_using_t5(self, prompt, text_encoder, tokenizer, max_...
method encode_prompt (line 373) | def encode_prompt(
method process (line 389) | def process(self, pipe: FluxImagePipeline, prompt, t5_sequence_length,...
class FluxImageUnit_ImageIDs (line 401) | class FluxImageUnit_ImageIDs(PipelineUnit):
method __init__ (line 402) | def __init__(self):
method process (line 405) | def process(self, pipe: FluxImagePipeline, latents):
class FluxImageUnit_EmbeddedGuidanceEmbedder (line 411) | class FluxImageUnit_EmbeddedGuidanceEmbedder(PipelineUnit):
method __init__ (line 412) | def __init__(self):
method process (line 415) | def process(self, pipe: FluxImagePipeline, embedded_guidance, latents):
class FluxImageUnit_Kontext (line 421) | class FluxImageUnit_Kontext(PipelineUnit):
method __init__ (line 422) | def __init__(self):
method process (line 429) | def process(self, pipe: FluxImagePipeline, kontext_images, tiled, tile...
class FluxImageUnit_ControlNet (line 451) | class FluxImageUnit_ControlNet(PipelineUnit):
method __init__ (line 452) | def __init__(self):
method apply_controlnet_mask_on_latents (line 459) | def apply_controlnet_mask_on_latents(self, pipe, latents, mask):
method apply_controlnet_mask_on_image (line 466) | def apply_controlnet_mask_on_image(self, pipe, image, mask):
method process (line 474) | def process(self, pipe: FluxImagePipeline, controlnet_inputs: list[Con...
class FluxImageUnit_IPAdapter (line 494) | class FluxImageUnit_IPAdapter(PipelineUnit):
method __init__ (line 495) | def __init__(self):
method process (line 503) | def process(self, pipe: FluxImagePipeline, inputs_shared, inputs_posi,...
class FluxImageUnit_EntityControl (line 523) | class FluxImageUnit_EntityControl(PipelineUnit):
method __init__ (line 524) | def __init__(self):
method encode_prompt_using_clip (line 532) | def encode_prompt_using_clip(self, prompt, text_encoder, tokenizer, ma...
method encode_prompt_using_t5 (line 543) | def encode_prompt_using_t5(self, prompt, text_encoder, tokenizer, max_...
method encode_prompt (line 554) | def encode_prompt(
method preprocess_masks (line 570) | def preprocess_masks(self, pipe, masks, height, width, dim):
method prepare_entity_inputs (line 578) | def prepare_entity_inputs(self, pipe, entity_prompts, entity_masks, wi...
method prepare_eligen (line 589) | def prepare_eligen(self, pipe, prompt_emb_nega, eligen_entity_prompts,...
method process (line 600) | def process(self, pipe: FluxImagePipeline, inputs_shared, inputs_posi,...
class FluxImageUnit_NexusGen (line 615) | class FluxImageUnit_NexusGen(PipelineUnit):
method __init__ (line 616) | def __init__(self):
method process (line 624) | def process(self, pipe: FluxImagePipeline, inputs_shared, inputs_posi,...
method get_editing_text_ids (line 648) | def get_editing_text_ids(self, latents, target_embed_height, target_em...
class FluxImageUnit_Step1x (line 671) | class FluxImageUnit_Step1x(PipelineUnit):
method __init__ (line 672) | def __init__(self):
method process (line 680) | def process(self, pipe: FluxImagePipeline, inputs_shared: dict, inputs...
class FluxImageUnit_TeaCache (line 699) | class FluxImageUnit_TeaCache(PipelineUnit):
method __init__ (line 700) | def __init__(self):
method process (line 703) | def process(self, pipe: FluxImagePipeline, num_inference_steps, tea_ca...
class FluxImageUnit_Flex (line 709) | class FluxImageUnit_Flex(PipelineUnit):
method __init__ (line 710) | def __init__(self):
method process (line 717) | def process(self, pipe: FluxImagePipeline, latents, flex_inpaint_image...
class FluxImageUnit_InfiniteYou (line 748) | class FluxImageUnit_InfiniteYou(PipelineUnit):
method __init__ (line 749) | def __init__(self):
method process (line 756) | def process(self, pipe: FluxImagePipeline, infinityou_id_image, infini...
class FluxImageUnit_ValueControl (line 765) | class FluxImageUnit_ValueControl(PipelineUnit):
method __init__ (line 766) | def __init__(self):
method add_to_text_embedding (line 776) | def add_to_text_embedding(self, prompt_emb, text_ids, value_emb):
method process (line 782) | def process(self, pipe: FluxImagePipeline, prompt_emb, text_ids, value...
class InfinitYou (line 796) | class InfinitYou(torch.nn.Module):
method __init__ (line 797) | def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16):
method _detect_face (line 812) | def _detect_face(self, id_image_cv2):
method extract_arcface_bgr_embedding (line 822) | def extract_arcface_bgr_embedding(self, in_image, landmark, device):
method prepare_infinite_you (line 831) | def prepare_infinite_you(self, model, id_image, infinityou_guidance, d...
class FluxImageUnit_LoRAEncode (line 847) | class FluxImageUnit_LoRAEncode(PipelineUnit):
method __init__ (line 848) | def __init__(self):
method parse_lora_encoder_inputs (line 856) | def parse_lora_encoder_inputs(self, lora_encoder_inputs):
method load_lora (line 867) | def load_lora(self, lora_config, dtype, device):
method lora_embedding (line 873) | def lora_embedding(self, pipe, lora_encoder_inputs):
method add_to_text_embedding (line 881) | def add_to_text_embedding(self, prompt_emb, text_ids, lora_emb):
method process (line 887) | def process(self, pipe: FluxImagePipeline, inputs_shared, inputs_posi,...
class TeaCache (line 908) | class TeaCache:
method __init__ (line 909) | def __init__(self, num_inference_steps, rel_l1_thresh):
method check (line 918) | def check(self, dit: FluxDiT, hidden_states, conditioning):
method store (line 942) | def store(self, hidden_states):
method update (line 946) | def update(self, hidden_states):
class FastTileWorker (line 951) | class FastTileWorker:
method __init__ (line 952) | def __init__(self):
method build_mask (line 956) | def build_mask(self, data, is_bound):
method tiled_forward (line 974) | def tiled_forward(self, forward_fn, model_input, tile_size, tile_strid...
function model_fn_flux_image (line 1004) | def model_fn_flux_image(
FILE: diffsynth/pipelines/ltx2_audio_video.py
class LTX2AudioVideoPipeline (line 28) | class LTX2AudioVideoPipeline(BasePipeline):
method __init__ (line 30) | def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16):
method from_pretrained (line 110) | def from_pretrained(
method denoise_stage (line 148) | def denoise_stage(self, inputs_shared, inputs_posi, inputs_nega, units...
method __call__ (line 168) | def __call__(
class LTX2AudioVideoUnit_PipelineChecker (line 251) | class LTX2AudioVideoUnit_PipelineChecker(PipelineUnit):
method __init__ (line 252) | def __init__(self):
method process (line 259) | def process(self, pipe: LTX2AudioVideoPipeline, inputs_shared, inputs_...
class LTX2AudioVideoUnit_ShapeChecker (line 274) | class LTX2AudioVideoUnit_ShapeChecker(PipelineUnit):
method __init__ (line 280) | def __init__(self):
method process (line 286) | def process(self, pipe: LTX2AudioVideoPipeline, height, width, num_fra...
class LTX2AudioVideoUnit_PromptEmbedder (line 297) | class LTX2AudioVideoUnit_PromptEmbedder(PipelineUnit):
method __init__ (line 299) | def __init__(self):
method _preprocess_text (line 307) | def _preprocess_text(
method encode_prompt (line 317) | def encode_prompt(self, pipe, text, padding_side="left"):
method process (line 323) | def process(self, pipe: LTX2AudioVideoPipeline, prompt: str):
class LTX2AudioVideoUnit_NoiseInitializer (line 329) | class LTX2AudioVideoUnit_NoiseInitializer(PipelineUnit):
method __init__ (line 330) | def __init__(self):
method process_stage (line 336) | def process_stage(self, pipe: LTX2AudioVideoPipeline, height, width, n...
method process (line 358) | def process(self, pipe: LTX2AudioVideoPipeline, height, width, num_fra...
class LTX2AudioVideoUnit_InputVideoEmbedder (line 362) | class LTX2AudioVideoUnit_InputVideoEmbedder(PipelineUnit):
method __init__ (line 363) | def __init__(self):
method process (line 370) | def process(self, pipe: LTX2AudioVideoPipeline, input_video, video_noi...
class LTX2AudioVideoUnit_InputAudioEmbedder (line 379) | class LTX2AudioVideoUnit_InputAudioEmbedder(PipelineUnit):
method __init__ (line 380) | def __init__(self):
method process (line 387) | def process(self, pipe: LTX2AudioVideoPipeline, input_audio, audio_noi...
class LTX2AudioVideoUnit_VideoRetakeEmbedder (line 401) | class LTX2AudioVideoUnit_VideoRetakeEmbedder(PipelineUnit):
method __init__ (line 402) | def __init__(self):
method process (line 409) | def process(self, pipe: LTX2AudioVideoPipeline, retake_video, height, ...
class LTX2AudioVideoUnit_AudioRetakeEmbedder (line 429) | class LTX2AudioVideoUnit_AudioRetakeEmbedder(PipelineUnit):
method __init__ (line 433) | def __init__(self):
method process (line 440) | def process(self, pipe: LTX2AudioVideoPipeline, retake_audio, seed, ra...
class LTX2AudioVideoUnit_InputImagesEmbedder (line 472) | class LTX2AudioVideoUnit_InputImagesEmbedder(PipelineUnit):
method __init__ (line 473) | def __init__(self):
method get_image_latent (line 480) | def get_image_latent(self, pipe, input_image, height, width, tiled, ti...
method apply_input_images_to_latents (line 488) | def apply_input_images_to_latents(self, latents, input_latents, input_...
method process (line 499) | def process(
class LTX2AudioVideoUnit_InContextVideoEmbedder (line 542) | class LTX2AudioVideoUnit_InContextVideoEmbedder(PipelineUnit):
method __init__ (line 543) | def __init__(self):
method check_in_context_video (line 550) | def check_in_context_video(self, pipe, in_context_video, height, width...
method process (line 565) | def process(self, pipe: LTX2AudioVideoPipeline, in_context_videos, hei...
class LTX2AudioVideoUnit_SwitchStage2 (line 590) | class LTX2AudioVideoUnit_SwitchStage2(PipelineUnit):
method __init__ (line 596) | def __init__(self):
method process (line 602) | def process(self, pipe: LTX2AudioVideoPipeline, stage_2_height, stage_...
class LTX2AudioVideoUnit_SetScheduleStage2 (line 614) | class LTX2AudioVideoUnit_SetScheduleStage2(PipelineUnit):
method __init__ (line 615) | def __init__(self):
method process (line 621) | def process(self, pipe: LTX2AudioVideoPipeline, video_latents, video_n...
class LTX2AudioVideoUnit_LatentsUpsampler (line 628) | class LTX2AudioVideoUnit_LatentsUpsampler(PipelineUnit):
method __init__ (line 629) | def __init__(self):
method process (line 636) | def process(self, pipe: LTX2AudioVideoPipeline, video_latents):
function model_fn_ltx2 (line 647) | def model_fn_ltx2(
FILE: diffsynth/pipelines/mova_audio_video.py
class MovaAudioVideoPipeline (line 25) | class MovaAudioVideoPipeline(BasePipeline):
method __init__ (line 27) | def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16):
method enable_usp (line 56) | def enable_usp(self):
method from_pretrained (line 64) | def from_pretrained(
method __call__ (line 114) | def __call__(
class MovaAudioVideoUnit_ShapeChecker (line 199) | class MovaAudioVideoUnit_ShapeChecker(PipelineUnit):
method __init__ (line 200) | def __init__(self):
method process (line 206) | def process(self, pipe: MovaAudioVideoPipeline, height, width, num_fra...
class MovaAudioVideoUnit_NoiseInitializer (line 211) | class MovaAudioVideoUnit_NoiseInitializer(PipelineUnit):
method __init__ (line 212) | def __init__(self):
method process (line 218) | def process(self, pipe: MovaAudioVideoPipeline, height, width, num_fra...
class MovaAudioVideoUnit_InputVideoEmbedder (line 229) | class MovaAudioVideoUnit_InputVideoEmbedder(PipelineUnit):
method __init__ (line 230) | def __init__(self):
method process (line 237) | def process(self, pipe: MovaAudioVideoPipeline, input_video, video_noi...
class MovaAudioVideoUnit_InputAudioEmbedder (line 247) | class MovaAudioVideoUnit_InputAudioEmbedder(PipelineUnit):
method __init__ (line 248) | def __init__(self):
method process (line 255) | def process(self, pipe: MovaAudioVideoPipeline, input_audio, audio_noi...
class MovaAudioVideoUnit_PromptEmbedder (line 268) | class MovaAudioVideoUnit_PromptEmbedder(PipelineUnit):
method __init__ (line 269) | def __init__(self):
method encode_prompt (line 278) | def encode_prompt(self, pipe: MovaAudioVideoPipeline, prompt):
method process (line 296) | def process(self, pipe: MovaAudioVideoPipeline, prompt) -> dict:
class MovaAudioVideoUnit_ImageEmbedderVAE (line 302) | class MovaAudioVideoUnit_ImageEmbedderVAE(PipelineUnit):
method __init__ (line 303) | def __init__(self):
method process (line 310) | def process(self, pipe: MovaAudioVideoPipeline, input_image, end_image...
class MovaAudioVideoUnit_UnifiedSequenceParallel (line 337) | class MovaAudioVideoUnit_UnifiedSequenceParallel(PipelineUnit):
method __init__ (line 338) | def __init__(self):
method process (line 341) | def process(self, pipe: MovaAudioVideoPipeline):
function model_fn_mova_audio_video (line 347) | def model_fn_mova_audio_video(
FILE: diffsynth/pipelines/qwen_image.py
class QwenImagePipeline (line 24) | class QwenImagePipeline(BasePipeline):
method __init__ (line 26) | def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16):
method from_pretrained (line 62) | def from_pretrained(
method __call__ (line 99) | def __call__(
class QwenImageBlockwiseMultiControlNet (line 198) | class QwenImageBlockwiseMultiControlNet(torch.nn.Module):
method __init__ (line 199) | def __init__(self, models: list[QwenImageBlockWiseControlNet]):
method preprocess (line 208) | def preprocess(self, controlnet_inputs: list[ControlNetInput], conditi...
method blockwise_forward (line 216) | def blockwise_forward(self, image, conditionings: list[torch.Tensor], ...
class QwenImageUnit_ShapeChecker (line 227) | class QwenImageUnit_ShapeChecker(PipelineUnit):
method __init__ (line 228) | def __init__(self):
method process (line 234) | def process(self, pipe: QwenImagePipeline, height, width):
class QwenImageUnit_NoiseInitializer (line 240) | class QwenImageUnit_NoiseInitializer(PipelineUnit):
method __init__ (line 241) | def __init__(self):
method process (line 247) | def process(self, pipe: QwenImagePipeline, height, width, seed, rand_d...
class QwenImageUnit_InputImageEmbedder (line 256) | class QwenImageUnit_InputImageEmbedder(PipelineUnit):
method __init__ (line 257) | def __init__(self):
method process (line 264) | def process(self, pipe: QwenImagePipeline, input_image, noise, tiled, ...
class QwenImageUnit_LayerInputImageEmbedder (line 284) | class QwenImageUnit_LayerInputImageEmbedder(PipelineUnit):
method __init__ (line 285) | def __init__(self):
method process (line 292) | def process(self, pipe: QwenImagePipeline, layer_input_image, tiled, t...
class QwenImageUnit_Inpaint (line 301) | class QwenImageUnit_Inpaint(PipelineUnit):
method __init__ (line 302) | def __init__(self):
method process (line 308) | def process(self, pipe: QwenImagePipeline, inpaint_mask, height, width...
class QwenImageUnit_PromptEmbedder (line 320) | class QwenImageUnit_PromptEmbedder(PipelineUnit):
method __init__ (line 321) | def __init__(self):
method extract_masked_hidden (line 331) | def extract_masked_hidden(self, hidden_states: torch.Tensor, mask: tor...
method calculate_dimensions (line 338) | def calculate_dimensions(self, target_area, ratio):
method resize_image (line 345) | def resize_image(self, image, target_area=384*384):
method encode_prompt (line 349) | def encode_prompt(self, pipe: QwenImagePipeline, prompt):
method encode_prompt_edit (line 361) | def encode_prompt_edit(self, pipe: QwenImagePipeline, prompt, edit_ima...
method encode_prompt_edit_multi (line 371) | def encode_prompt_edit_multi(self, pipe: QwenImagePipeline, prompt, ed...
method process (line 384) | def process(self, pipe: QwenImagePipeline, prompt, edit_image=None) ->...
class QwenImageUnit_EntityControl (line 404) | class QwenImageUnit_EntityControl(PipelineUnit):
method __init__ (line 405) | def __init__(self):
method extract_masked_hidden (line 413) | def extract_masked_hidden(self, hidden_states: torch.Tensor, mask: tor...
method get_prompt_emb (line 420) | def get_prompt_emb(self, pipe: QwenImagePipeline, prompt) -> dict:
method preprocess_masks (line 440) | def preprocess_masks(self, pipe, masks, height, width, dim):
method prepare_entity_inputs (line 448) | def prepare_entity_inputs(self, pipe, entity_prompts, entity_masks, wi...
method prepare_eligen (line 458) | def prepare_eligen(self, pipe, prompt_emb_nega, eligen_entity_prompts,...
method process (line 470) | def process(self, pipe: QwenImagePipeline, inputs_shared, inputs_posi,...
class QwenImageUnit_BlockwiseControlNet (line 486) | class QwenImageUnit_BlockwiseControlNet(PipelineUnit):
method __init__ (line 487) | def __init__(self):
method apply_controlnet_mask_on_latents (line 494) | def apply_controlnet_mask_on_latents(self, pipe, latents, mask):
method apply_controlnet_mask_on_image (line 501) | def apply_controlnet_mask_on_image(self, pipe, image, mask):
method process (line 509) | def process(self, pipe: QwenImagePipeline, blockwise_controlnet_inputs...
class QwenImageUnit_EditImageEmbedder (line 529) | class QwenImageUnit_EditImageEmbedder(PipelineUnit):
method __init__ (line 530) | def __init__(self):
method calculate_dimensions (line 538) | def calculate_dimensions(self, target_area, ratio):
method edit_image_auto_resize (line 547) | def edit_image_auto_resize(self, edit_image):
method process (line 552) | def process(self, pipe: QwenImagePipeline, edit_image, tiled, tile_siz...
class QwenImageUnit_Image2LoRAEncode (line 572) | class QwenImageUnit_Image2LoRAEncode(PipelineUnit):
method __init__ (line 573) | def __init__(self):
method extract_masked_hidden (line 583) | def extract_masked_hidden(self, hidden_states: torch.Tensor, mask: tor...
method encode_prompt_edit (line 590) | def encode_prompt_edit(self, pipe: QwenImagePipeline, prompt, edit_ima...
method encode_images_using_siglip2 (line 604) | def encode_images_using_siglip2(self, pipe: QwenImagePipeline, images:...
method encode_images_using_dinov3 (line 613) | def encode_images_using_dinov3(self, pipe: QwenImagePipeline, images: ...
method encode_images_using_qwenvl (line 622) | def encode_images_using_qwenvl(self, pipe: QwenImagePipeline, images: ...
method encode_images (line 631) | def encode_images(self, pipe: QwenImagePipeline, images: list[Image.Im...
method process (line 647) | def process(self, pipe: QwenImagePipeline, image2lora_images):
class QwenImageUnit_Image2LoRADecode (line 654) | class QwenImageUnit_Image2LoRADecode(PipelineUnit):
method __init__ (line 655) | def __init__(self):
method process (line 662) | def process(self, pipe: QwenImagePipeline, image2lora_x, image2lora_re...
class QwenImageUnit_ContextImageEmbedder (line 682) | class QwenImageUnit_ContextImageEmbedder(PipelineUnit):
method __init__ (line 683) | def __init__(self):
method process (line 690) | def process(self, pipe: QwenImagePipeline, context_image, height, widt...
function model_fn_qwen_image (line 701) | def model_fn_qwen_image(
FILE: diffsynth/pipelines/wan_video.py
class WanVideoPipeline (line 32) | class WanVideoPipeline(BasePipeline):
method __init__ (line 34) | def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16):
method enable_usp (line 88) | def enable_usp(self):
method from_pretrained (line 103) | def from_pretrained(
method __call__ (line 181) | def __call__(
class WanVideoUnit_ShapeChecker (line 354) | class WanVideoUnit_ShapeChecker(PipelineUnit):
method __init__ (line 355) | def __init__(self):
method process (line 361) | def process(self, pipe: WanVideoPipeline, height, width, num_frames):
class WanVideoUnit_NoiseInitializer (line 367) | class WanVideoUnit_NoiseInitializer(PipelineUnit):
method __init__ (line 368) | def __init__(self):
method process (line 374) | def process(self, pipe: WanVideoPipeline, height, width, num_frames, s...
class WanVideoUnit_InputVideoEmbedder (line 387) | class WanVideoUnit_InputVideoEmbedder(PipelineUnit):
method __init__ (line 388) | def __init__(self):
method process (line 395) | def process(self, pipe: WanVideoPipeline, input_video, noise, tiled, t...
class WanVideoUnit_PromptEmbedder (line 418) | class WanVideoUnit_PromptEmbedder(PipelineUnit):
method __init__ (line 419) | def __init__(self):
method encode_prompt (line 428) | def encode_prompt(self, pipe: WanVideoPipeline, prompt):
method process (line 438) | def process(self, pipe: WanVideoPipeline, prompt, positive) -> dict:
class WanVideoUnit_ImageEmbedderCLIP (line 445) | class WanVideoUnit_ImageEmbedderCLIP(PipelineUnit):
method __init__ (line 446) | def __init__(self):
method process (line 453) | def process(self, pipe: WanVideoPipeline, input_image, end_image, heig...
class WanVideoUnit_ImageEmbedderVAE (line 468) | class WanVideoUnit_ImageEmbedderVAE(PipelineUnit):
method __init__ (line 469) | def __init__(self):
method process (line 476) | def process(self, pipe: WanVideoPipeline, input_image, end_image, num_...
class WanVideoUnit_ImageEmbedderFused (line 503) | class WanVideoUnit_ImageEmbedderFused(PipelineUnit):
method __init__ (line 507) | def __init__(self):
method process (line 514) | def process(self, pipe: WanVideoPipeline, input_image, latents, height...
class WanVideoUnit_FunControl (line 525) | class WanVideoUnit_FunControl(PipelineUnit):
method __init__ (line 526) | def __init__(self):
method process (line 533) | def process(self, pipe: WanVideoPipeline, control_video, num_frames, h...
class WanVideoUnit_FunReference (line 551) | class WanVideoUnit_FunReference(PipelineUnit):
method __init__ (line 552) | def __init__(self):
method process (line 559) | def process(self, pipe: WanVideoPipeline, reference_image, height, wid...
class WanVideoUnit_FunCameraControl (line 574) | class WanVideoUnit_FunCameraControl(PipelineUnit):
method __init__ (line 575) | def __init__(self):
method process (line 582) | def process(self, pipe: WanVideoPipeline, height, width, num_frames, c...
class WanVideoUnit_SpeedControl (line 625) | class WanVideoUnit_SpeedControl(PipelineUnit):
method __init__ (line 626) | def __init__(self):
method process (line 632) | def process(self, pipe: WanVideoPipeline, motion_bucket_id):
class WanVideoUnit_VACE (line 640) | class WanVideoUnit_VACE(PipelineUnit):
method __init__ (line 641) | def __init__(self):
method process (line 648) | def process(
class WanVideoUnit_VAP (line 703) | class WanVideoUnit_VAP(PipelineUnit):
method __init__ (line 704) | def __init__(self):
method encode_prompt (line 712) | def encode_prompt(self, pipe: WanVideoPipeline, prompt):
method process (line 722) | def process(self, pipe: WanVideoPipeline, inputs_shared, inputs_posi, ...
class WanVideoUnit_UnifiedSequenceParallel (line 781) | class WanVideoUnit_UnifiedSequenceParallel(PipelineUnit):
method __init__ (line 782) | def __init__(self):
method process (line 785) | def process(self, pipe: WanVideoPipeline):
class WanVideoUnit_TeaCache (line 793) | class WanVideoUnit_TeaCache(PipelineUnit):
method __init__ (line 794) | def __init__(self):
method process (line 802) | def process(self, pipe: WanVideoPipeline, num_inference_steps, tea_cac...
class WanVideoUnit_CfgMerger (line 809) | class WanVideoUnit_CfgMerger(PipelineUnit):
method __init__ (line 810) | def __init__(self):
method process (line 814) | def process(self, pipe: WanVideoPipeline, inputs_shared, inputs_posi, ...
class WanVideoUnit_S2V (line 830) | class WanVideoUnit_S2V(PipelineUnit):
method __init__ (line 831) | def __init__(self):
method process_audio (line 839) | def process_audio(self, pipe: WanVideoPipeline, input_audio, audio_sam...
method process_motion_latents (line 849) | def process_motion_latents(self, pipe: WanVideoPipeline, height, width...
method process_pose_cond (line 864) | def process_pose_cond(self, pipe: WanVideoPipeline, s2v_pose_video, nu...
method process (line 887) | def process(self, pipe: WanVideoPipeline, inputs_shared, inputs_posi, ...
method pre_calculate_audio_pose (line 903) | def pre_calculate_audio_pose(pipe: WanVideoPipeline, input_audio=None,...
class WanVideoPostUnit_S2V (line 914) | class WanVideoPostUnit_S2V(PipelineUnit):
method __init__ (line 915) | def __init__(self):
method process (line 918) | def process(self, pipe: WanVideoPipeline, latents, motion_latents, dro...
class WanVideoUnit_AnimateVideoSplit (line 925) | class WanVideoUnit_AnimateVideoSplit(PipelineUnit):
method __init__ (line 926) | def __init__(self):
method process (line 932) | def process(self, pipe: WanVideoPipeline, input_video, animate_pose_vi...
class WanVideoUnit_AnimatePoseLatents (line 946) | class WanVideoUnit_AnimatePoseLatents(PipelineUnit):
method __init__ (line 947) | def __init__(self):
method process (line 954) | def process(self, pipe: WanVideoPipeline, animate_pose_video, tiled, t...
class WanVideoUnit_AnimateFacePixelValues (line 963) | class WanVideoUnit_AnimateFacePixelValues(PipelineUnit):
method __init__ (line 964) | def __init__(self):
method process (line 971) | def process(self, pipe: WanVideoPipeline, inputs_shared, inputs_posi, ...
class WanVideoUnit_AnimateInpaint (line 979) | class WanVideoUnit_AnimateInpaint(PipelineUnit):
method __init__ (line 980) | def __init__(self):
method get_i2v_mask (line 987) | def get_i2v_mask(self, lat_t, lat_h, lat_w, mask_len=1, mask_pixel_val...
method process (line 998) | def process(self, pipe: WanVideoPipeline, animate_inpaint_video, anima...
class WanVideoUnit_LongCatVideo (line 1023) | class WanVideoUnit_LongCatVideo(PipelineUnit):
method __init__ (line 1024) | def __init__(self):
method process (line 1031) | def process(self, pipe: WanVideoPipeline, longcat_video):
class WanVideoUnit_WanToDance_ProcessInputs (line 1040) | class WanVideoUnit_WanToDance_ProcessInputs(PipelineUnit):
method __init__ (line 1041) | def __init__(self):
method get_music_base_feature (line 1046) | def get_music_base_feature(self, music_path, fps=30):
method process (line 1078) | def process(self, pipe: WanVideoPipeline, inputs_shared, inputs_posi, ...
class WanVideoUnit_WanToDance_RefImageEmbedder (line 1086) | class WanVideoUnit_WanToDance_RefImageEmbedder(PipelineUnit):
method __init__ (line 1087) | def __init__(self):
method process (line 1094) | def process(self, pipe: WanVideoPipeline, wantodance_reference_image, ...
class WanVideoUnit_WanToDance_ImageKeyframesEmbedder (line 1106) | class WanVideoUnit_WanToDance_ImageKeyframesEmbedder(PipelineUnit):
method __init__ (line 1107) | def __init__(self):
method process (line 1114) | def process(self, pipe: WanVideoPipeline, wantodance_keyframes, wantod...
class TeaCache (line 1145) | class TeaCache:
method __init__ (line 1146) | def __init__(self, num_inference_steps, rel_l1_thresh, model_id):
method check (line 1166) | def check(self, dit: WanModel, x, t_mod):
method store (line 1188) | def store(self, hidden_states):
method update (line 1192) | def update(self, hidden_states):
class TemporalTiler_BCTHW (line 1198) | class TemporalTiler_BCTHW:
method __init__ (line 1199) | def __init__(self):
method build_1d_mask (line 1202) | def build_1d_mask(self, length, left_bound, right_bound, border_width):
method build_mask (line 1214) | def build_mask(self, data, is_bound, border_width):
method run (line 1220) | def run(self, model_fn, sliding_window_size, sliding_window_stride, co...
function wantodance_get_single_freqs (line 1250) | def wantodance_get_single_freqs(freqs, frame_num, fps):
function model_fn_wan_video (line 1267) | def model_fn_wan_video(
function model_fn_longcat_video (line 1596) | def model_fn_longcat_video(
function model_fn_wans2v (line 1626) | def model_fn_wans2v(
FILE: diffsynth/pipelines/z_image.py
class ZImagePipeline (line 27) | class ZImagePipeline(BasePipeline):
method __init__ (line 29) | def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16):
method from_pretrained (line 60) | def from_pretrained(
method __call__ (line 94) | def __call__(
class ZImageUnit_ShapeChecker (line 167) | class ZImageUnit_ShapeChecker(PipelineUnit):
method __init__ (line 168) | def __init__(self):
method process (line 174) | def process(self, pipe: ZImagePipeline, height, width):
class ZImageUnit_PromptEmbedder (line 179) | class ZImageUnit_PromptEmbedder(PipelineUnit):
method __init__ (line 180) | def __init__(self):
method encode_prompt (line 190) | def encode_prompt(
method encode_prompt_omni (line 236) | def encode_prompt_omni(
method process (line 300) | def process(self, pipe: ZImagePipeline, prompt, edit_image):
class ZImageUnit_NoiseInitializer (line 313) | class ZImageUnit_NoiseInitializer(PipelineUnit):
method __init__ (line 314) | def __init__(self):
method process (line 320) | def process(self, pipe: ZImagePipeline, height, width, seed, rand_devi...
class ZImageUnit_InputImageEmbedder (line 325) | class ZImageUnit_InputImageEmbedder(PipelineUnit):
method __init__ (line 326) | def __init__(self):
method process (line 333) | def process(self, pipe: ZImagePipeline, input_image, noise):
class ZImageUnit_EditImageAutoResize (line 346) | class ZImageUnit_EditImageAutoResize(PipelineUnit):
method __init__ (line 347) | def __init__(self):
method process (line 353) | def process(self, pipe: ZImagePipeline, edit_image, edit_image_auto_re...
class ZImageUnit_EditImageEmbedderSiglip (line 365) | class ZImageUnit_EditImageEmbedderSiglip(PipelineUnit):
method __init__ (line 366) | def __init__(self):
method process (line 373) | def process(self, pipe: ZImagePipeline, edit_image):
class ZImageUnit_EditImageEmbedderVAE (line 385) | class ZImageUnit_EditImageEmbedderVAE(PipelineUnit):
method __init__ (line 386) | def __init__(self):
method process (line 393) | def process(self, pipe: ZImagePipeline, edit_image):
class ZImageUnit_PAIControlNet (line 406) | class ZImageUnit_PAIControlNet(PipelineUnit):
method __init__ (line 407) | def __init__(self):
method process (line 414) | def process(self, pipe: ZImagePipeline, controlnet_inputs: List[Contro...
function model_fn_z_image (line 446) | def model_fn_z_image(
class ZImageUnit_Image2LoRAEncode (line 501) | class ZImageUnit_Image2LoRAEncode(PipelineUnit):
method __init__ (line 502) | def __init__(self):
method encode_images_using_siglip2 (line 511) | def encode_images_using_siglip2(self, pipe: ZImagePipeline, images: li...
method encode_images_using_dinov3 (line 520) | def encode_images_using_dinov3(self, pipe: ZImagePipeline, images: lis...
method encode_images (line 529) | def encode_images(self, pipe: ZImagePipeline, images: list[Image.Image]):
method process (line 539) | def process(self, pipe: ZImagePipeline, image2lora_images):
class ZImageUnit_Image2LoRADecode (line 546) | class ZImageUnit_Image2LoRADecode(PipelineUnit):
method __init__ (line 547) | def __init__(self):
method process (line 554) | def process(self, pipe: ZImagePipeline, image2lora_x):
function model_fn_z_image_turbo (line 566) | def model_fn_z_image_turbo(
function apply_npu_patch (line 675) | def apply_npu_patch(enable_npu_patch: bool=True):
FILE: diffsynth/utils/controlnet/annotator.py
class Annotator (line 9) | class Annotator:
method __init__ (line 10) | def __init__(self, processor_id: Processor_id, model_path="models/Anno...
method to (line 43) | def to(self,device):
method __call__ (line 48) | def __call__(self, image, mask=None):
FILE: diffsynth/utils/controlnet/controlnet_input.py
class ControlNetInput (line 6) | class ControlNetInput:
FILE: diffsynth/utils/data/__init__.py
class LowMemoryVideo (line 9) | class LowMemoryVideo:
method __init__ (line 10) | def __init__(self, file_name):
method __len__ (line 13) | def __len__(self):
method __getitem__ (line 16) | def __getitem__(self, item):
method __del__ (line 19) | def __del__(self):
function split_file_name (line 23) | def split_file_name(file_name):
function search_for_images (line 42) | def search_for_images(folder):
class LowMemoryImageFolder (line 50) | class LowMemoryImageFolder:
method __init__ (line 51) | def __init__(self, folder, file_list=None):
method __len__ (line 57) | def __len__(self):
method __getitem__ (line 60) | def __getitem__(self, item):
method __del__ (line 63) | def __del__(self):
function crop_and_resize (line 67) | def crop_and_resize(image, height, width):
class VideoData (line 83) | class VideoData:
method __init__ (line 84) | def __init__(self, video_file=None, image_folder=None, height=None, wi...
method raw_data (line 96) | def raw_data(self):
method set_length (line 102) | def set_length(self, length):
method set_shape (line 105) | def set_shape(self, height, width):
method __len__ (line 109) | def __len__(self):
method shape (line 115) | def shape(self):
method __getitem__ (line 122) | def __getitem__(self, item):
method __del__ (line 130) | def __del__(self):
method save_images (line 133) | def save_images(self, folder):
function save_video (line 140) | def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
function save_frames (line 147) | def save_frames(frames, save_path):
function merge_video_audio (line 153) | def merge_video_audio(video_path: str, audio_path: str):
function save_video_with_audio (line 215) | def save_video_with_audio(frames, save_path, audio_path, fps=16, quality...
FILE: diffsynth/utils/data/audio.py
function convert_to_mono (line 5) | def convert_to_mono(audio_tensor: torch.Tensor) -> torch.Tensor:
function convert_to_stereo (line 13) | def convert_to_stereo(audio_tensor: torch.Tensor) -> torch.Tensor:
function resample_waveform (line 23) | def resample_waveform(waveform: torch.Tensor, source_rate: int, target_r...
function read_audio_with_torchcodec (line 31) | def read_audio_with_torchcodec(
function read_audio (line 55) | def read_audio(
function save_audio (line 90) | def save_audio(waveform: torch.Tensor, sample_rate: int, save_path: str,...
FILE: diffsynth/utils/data/audio_video.py
function _resample_audio (line 9) | def _resample_audio(
function _write_audio (line 38) | def _write_audio(
function _prepare_audio_stream (line 61) | def _prepare_audio_stream(container: av.container.Container, audio_sampl...
function write_video_audio (line 79) | def write_video_audio(
FILE: diffsynth/utils/data/media_io_ltx2.py
function encode_single_frame (line 7) | def encode_single_frame(output_file: str, image_array: np.ndarray, crf: ...
function decode_single_frame (line 24) | def decode_single_frame(video_file: str) -> np.array:
function ltx2_preprocess (line 34) | def ltx2_preprocess(image: np.array, crf: float = 33) -> np.array:
FILE: diffsynth/utils/lora/flux.py
class FluxLoRALoader (line 5) | class FluxLoRALoader(GeneralLoRALoader):
method __init__ (line 6) | def __init__(self, device="cpu", torch_dtype=torch.float32):
method fuse_lora_to_base_model (line 81) | def fuse_lora_to_base_model(self, model: torch.nn.Module, state_dict_l...
method convert_state_dict (line 84) | def convert_state_dict(self, state_dict):
class FluxLoRAConverter (line 209) | class FluxLoRAConverter:
method __init__ (line 210) | def __init__(self):
method align_to_opensource_format (line 214) | def align_to_opensource_format(state_dict, alpha=None):
method align_to_diffsynth_format (line 258) | def align_to_diffsynth_format(state_dict):
FILE: diffsynth/utils/lora/general.py
class GeneralLoRALoader (line 4) | class GeneralLoRALoader:
method __init__ (line 5) | def __init__(self, device="cpu", torch_dtype=torch.float32):
method get_name_dict (line 10) | def get_name_dict(self, lora_state_dict):
method convert_state_dict (line 37) | def convert_state_dict(self, state_dict, suffix=".weight"):
method fuse_lora_to_base_model (line 52) | def fuse_lora_to_base_model(self, model: torch.nn.Module, state_dict, ...
FILE: diffsynth/utils/lora/merge.py
function merge_lora_weight (line 5) | def merge_lora_weight(tensors_A, tensors_B):
function merge_lora (line 11) | def merge_lora(loras: List[Dict[str, torch.Tensor]], alpha=1):
FILE: diffsynth/utils/lora/reset_rank.py
function decomposite (line 3) | def decomposite(tensor_A, tensor_B, rank):
function reset_lora_rank (line 11) | def reset_lora_rank(lora, rank):
FILE: diffsynth/utils/ses/ses.py
function split_dwt (line 7) | def split_dwt(z_tensor_cpu, wavelet_name, dwt_level):
function reconstruct_dwt (line 27) | def reconstruct_dwt(c_low_tensor_cpu, c_high_coeffs, wavelet_name, origi...
function ses_search (line 46) | def ses_search(
FILE: diffsynth/utils/state_dict_converters/anima_dit.py
function AnimaDiTStateDictConverter (line 1) | def AnimaDiTStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/flux2_text_encoder.py
function Flux2TextEncoderStateDictConverter (line 1) | def Flux2TextEncoderStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/flux_controlnet.py
function FluxControlNetStateDictConverter (line 4) | def FluxControlNetStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/flux_dit.py
function FluxDiTStateDictConverter (line 4) | def FluxDiTStateDictConverter(state_dict):
function FluxDiTStateDictConverterFromDiffusers (line 95) | def FluxDiTStateDictConverterFromDiffusers(state_dict):
FILE: diffsynth/utils/state_dict_converters/flux_infiniteyou.py
function FluxInfiniteYouImageProjectorStateDictConverter (line 1) | def FluxInfiniteYouImageProjectorStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/flux_ipadapter.py
function FluxIpAdapterStateDictConverter (line 1) | def FluxIpAdapterStateDictConverter(state_dict):
function SiglipStateDictConverter (line 27) | def SiglipStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py
function FluxTextEncoderClipStateDictConverter (line 1) | def FluxTextEncoderClipStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py
function FluxTextEncoderT5StateDictConverter (line 1) | def FluxTextEncoderT5StateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/flux_vae.py
function FluxVAEEncoderStateDictConverter (line 1) | def FluxVAEEncoderStateDictConverter(state_dict):
function FluxVAEDecoderStateDictConverter (line 118) | def FluxVAEDecoderStateDictConverter(state_dict):
function FluxVAEEncoderStateDictConverterDiffusers (line 267) | def FluxVAEEncoderStateDictConverterDiffusers(state_dict):
function FluxVAEDecoderStateDictConverterDiffusers (line 326) | def FluxVAEDecoderStateDictConverterDiffusers(state_dict):
FILE: diffsynth/utils/state_dict_converters/ltx2_audio_vae.py
function LTX2AudioEncoderStateDictConverter (line 1) | def LTX2AudioEncoderStateDictConverter(state_dict):
function LTX2AudioDecoderStateDictConverter (line 14) | def LTX2AudioDecoderStateDictConverter(state_dict):
function LTX2VocoderStateDictConverter (line 26) | def LTX2VocoderStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/ltx2_dit.py
function LTXModelStateDictConverter (line 1) | def LTXModelStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/ltx2_text_encoder.py
function LTX2TextEncoderStateDictConverter (line 1) | def LTX2TextEncoderStateDictConverter(state_dict):
function LTX2TextEncoderPostModulesStateDictConverter (line 19) | def LTX2TextEncoderPostModulesStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/ltx2_video_vae.py
function LTX2VideoEncoderStateDictConverter (line 1) | def LTX2VideoEncoderStateDictConverter(state_dict):
function LTX2VideoDecoderStateDictConverter (line 14) | def LTX2VideoDecoderStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/nexus_gen.py
function NexusGenAutoregressiveModelStateDictConverter (line 1) | def NexusGenAutoregressiveModelStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/nexus_gen_projector.py
function NexusGenMergerStateDictConverter (line 1) | def NexusGenMergerStateDictConverter(state_dict):
function NexusGenAdapterStateDictConverter (line 10) | def NexusGenAdapterStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py
function QwenImageTextEncoderStateDictConverter (line 1) | def QwenImageTextEncoderStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/step1x_connector.py
function Qwen2ConnectorStateDictConverter (line 1) | def Qwen2ConnectorStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py
function WanAnimateAdapterStateDictConverter (line 1) | def WanAnimateAdapterStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/wan_video_dit.py
function WanVideoDiTFromDiffusers (line 1) | def WanVideoDiTFromDiffusers(state_dict):
function WanVideoDiTStateDictConverter (line 72) | def WanVideoDiTStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/wan_video_image_encoder.py
function WanImageEncoderStateDictConverter (line 1) | def WanImageEncoderStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/wan_video_mot.py
function WanVideoMotStateDictConverter (line 1) | def WanVideoMotStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/wan_video_vace.py
function VaceWanModelDictConverter (line 1) | def VaceWanModelDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/wan_video_vae.py
function WanVideoVAEStateDictConverter (line 1) | def WanVideoVAEStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py
function WanS2VAudioEncoderStateDictConverter (line 1) | def WanS2VAudioEncoderStateDictConverter(state_dict):
FILE: diffsynth/utils/state_dict_converters/z_image_text_encoder.py
function ZImageTextEncoderStateDictConverter (line 1) | def ZImageTextEncoderStateDictConverter(state_dict):
FILE: diffsynth/utils/xfuser/xdit_context_parallel.py
function initialize_usp (line 15) | def initialize_usp(device_type):
function sinusoidal_embedding_1d (line 28) | def sinusoidal_embedding_1d(dim, position):
function pad_freqs (line 34) | def pad_freqs(original_tensor, target_len):
function rope_apply (line 49) | def rope_apply(x, freqs, num_heads):
function usp_dit_forward (line 64) | def usp_dit_forward(self,
function usp_attn_forward (line 120) | def usp_attn_forward(self, x, freqs):
function get_current_chunk (line 149) | def get_current_chunk(x, dim=1):
function gather_all_chunks (line 167) | def gather_all_chunks(x, seq_len=None, dim=1):
FILE: docs/en/Research_Tutorial/train_from_scratch.py
class AAAPositionalEmbedding (line 16) | class AAAPositionalEmbedding(torch.nn.Module):
method __init__ (line 17) | def __init__(self, height=16, width=16, dim=1024):
method forward (line 22) | def forward(self, image, text):
class AAABlock (line 33) | class AAABlock(torch.nn.Module):
method __init__ (line 34) | def __init__(self, dim=1024, num_heads=32):
method attention (line 50) | def attention(self, emb, pos_emb):
method feed_forward (line 61) | def feed_forward(self, emb, pos_emb):
method forward (line 66) | def forward(self, emb, pos_emb, t_emb):
class AAADiT (line 73) | class AAADiT(torch.nn.Module):
method __init__ (line 74) | def __init__(self, dim=1024):
method forward (line 83) | def forward(
class AAAImagePipeline (line 111) | class AAAImagePipeline(BasePipeline):
method __init__ (line 112) | def __init__(self, device="cuda", torch_dtype=torch.bfloat16):
method from_pretrained (line 131) | def from_pretrained(
method __call__ (line 155) | def __call__(
class AAAUnit_PromptEmbedder (line 211) | class AAAUnit_PromptEmbedder(PipelineUnit):
method __init__ (line 212) | def __init__(self):
method process (line 222) | def process(self, pipe: AAAImagePipeline, prompt):
class AAAUnit_NoiseInitializer (line 236) | class AAAUnit_NoiseInitializer(PipelineUnit):
method __init__ (line 237) | def __init__(self):
method process (line 243) | def process(self, pipe: AAAImagePipeline, height, width, seed, rand_de...
class AAAUnit_InputImageEmbedder (line 248) | class AAAUnit_InputImageEmbedder(PipelineUnit):
method __init__ (line 249) | def __init__(self):
method process (line 256) | def process(self, pipe: AAAImagePipeline, input_image, noise):
function model_fn_aaa (line 269) | def model_fn_aaa(
class AAATrainingModule (line 288) | class AAATrainingModule(DiffusionTrainingModule):
method __init__ (line 289) | def __init__(self, device):
method forward (line 304) | def forward(self, data):
FILE: docs/en/conf.py
function get_version (line 29) | def get_version():
FILE: docs/zh/Research_Tutorial/train_from_scratch.py
class AAAPositionalEmbedding (line 16) | class AAAPositionalEmbedding(torch.nn.Module):
method __init__ (line 17) | def __init__(self, height=16, width=16, dim=1024):
method forward (line 22) | def forward(self, image, text):
class AAABlock (line 33) | class AAABlock(torch.nn.Module):
method __init__ (line 34) | def __init__(self, dim=1024, num_heads=32):
method attention (line 50) | def attention(self, emb, pos_emb):
method feed_forward (line 61) | def feed_forward(self, emb, pos_emb):
method forward (line 66) | def forward(self, emb, pos_emb, t_emb):
class AAADiT (line 73) | class AAADiT(torch.nn.Module):
method __init__ (line 74) | def __init__(self, dim=1024):
method forward (line 83) | def forward(
class AAAImagePipeline (line 111) | class AAAImagePipeline(BasePipeline):
method __init__ (line 112) | def __init__(self, device="cuda", torch_dtype=torch.bfloat16):
method from_pretrained (line 131) | def from_pretrained(
method __call__ (line 155) | def __call__(
class AAAUnit_PromptEmbedder (line 211) | class AAAUnit_PromptEmbedder(PipelineUnit):
method __init__ (line 212) | def __init__(self):
method process (line 222) | def process(self, pipe: AAAImagePipeline, prompt):
class AAAUnit_NoiseInitializer (line 236) | class AAAUnit_NoiseInitializer(PipelineUnit):
method __init__ (line 237) | def __init__(self):
method process (line 243) | def process(self, pipe: AAAImagePipeline, height, width, seed, rand_de...
class AAAUnit_InputImageEmbedder (line 248) | class AAAUnit_InputImageEmbedder(PipelineUnit):
method __init__ (line 249) | def __init__(self):
method process (line 256) | def process(self, pipe: AAAImagePipeline, input_image, noise):
function model_fn_aaa (line 269) | def model_fn_aaa(
class AAATrainingModule (line 288) | class AAATrainingModule(DiffusionTrainingModule):
method __init__ (line 289) | def __init__(self, device):
method forward (line 304) | def forward(self, data):
FILE: docs/zh/conf.py
function get_version (line 29) | def get_version():
FILE: examples/anima/model_training/train.py
class AnimaTrainingModule (line 8) | class AnimaTrainingModule(DiffusionTrainingModule):
method __init__ (line 9) | def __init__(
method get_pipeline_inputs (line 55) | def get_pipeline_inputs(self, data):
method forward (line 74) | def forward(self, data, inputs=None):
function anima_parser (line 83) | def anima_parser():
FILE: examples/dev_tools/fix_path.py
function read_file (line 4) | def read_file(path):
function get_files (line 9) | def get_files(files, path):
function fix_path (line 16) | def fix_path(doc_root_path):
FILE: examples/dev_tools/unit_test.py
function script_is_processed (line 5) | def script_is_processed(output_path, script):
function filter_unprocessed_tasks (line 9) | def filter_unprocessed_tasks(script_path):
function run_inference (line 21) | def run_inference(script_path):
function run_tasks_on_single_GPU (line 36) | def run_tasks_on_single_GPU(script_path, tasks, gpu_id, num_gpu):
function run_train_multi_GPU (line 52) | def run_train_multi_GPU(script_path):
function run_train_single_GPU (line 65) | def run_train_single_GPU(script_path):
function move_files (line 74) | def move_files(prefix, target_folder):
function test_qwen_image (line 80) | def test_qwen_image():
function test_wan (line 89) | def test_wan():
function test_flux (line 102) | def test_flux():
function test_z_image (line 111) | def test_z_image():
FILE: examples/flux/model_inference/FLUX.1-dev-EliGen.py
function visualize_masks (line 8) | def visualize_masks(image, masks, mask_prompts, output_path, font_size=3...
function example (line 65) | def example(pipe, seeds, example_id, global_prompt, entity_p
Condensed preview — 841 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (4,296K chars).
[
{
"path": ".github/workflows/publish.yaml",
"chars": 750,
"preview": "name: release\n\non:\n push:\n tags:\n - 'v**'\n\nconcurrency:\n group: ${{ github.workflow }}-${{ github.ref }}-publi"
},
{
"path": ".gitignore",
"chars": 3196,
"preview": "/data\n/models\n/scripts\n/diffusers\n/.vscode\n*.pkl\n*.safetensors\n*.pth\n*.ckpt\n*.pt\n*.bin\n*.DS_Store\n*.msc\n*.mv\nlog*.txt\n\n#"
},
{
"path": "LICENSE",
"chars": 11347,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 126758,
"preview": "# DiffSynth-Studio\n\n<a href=\"https://github.com/modelscope/DiffSynth-Studio\"><img src=\".github/workflows/logo.gif\" title"
},
{
"path": "README_zh.md",
"chars": 113852,
"preview": "# DiffSynth-Studio\n\n<a href=\"https://github.com/modelscope/DiffSynth-Studio\"><img src=\".github/workflows/logo.gif\" title"
},
{
"path": "diffsynth/__init__.py",
"chars": 20,
"preview": "from .core import *\n"
},
{
"path": "diffsynth/configs/__init__.py",
"chars": 132,
"preview": "from .model_configs import MODEL_CONFIGS\nfrom .vram_management_module_maps import VRAM_MANAGEMENT_MODULE_MAPS, VERSION_C"
},
{
"path": "diffsynth/configs/model_configs.py",
"chars": 55835,
"preview": "qwen_image_series = [\n {\n # Example: ModelConfig(model_id=\"Qwen/Qwen-Image\", origin_file_pattern=\"text_encoder"
},
{
"path": "diffsynth/configs/vram_management_module_maps.py",
"chars": 19767,
"preview": "flux_general_vram_config = {\n \"torch.nn.Linear\": \"diffsynth.core.vram.layers.AutoWrappedLinear\",\n \"torch.nn.Embedd"
},
{
"path": "diffsynth/core/__init__.py",
"chars": 133,
"preview": "from .attention import *\nfrom .data import *\nfrom .gradient import *\nfrom .loader import *\nfrom .vram import *\nfrom .dev"
},
{
"path": "diffsynth/core/attention/__init__.py",
"chars": 41,
"preview": "from .attention import attention_forward\n"
},
{
"path": "diffsynth/core/attention/attention.py",
"chars": 5756,
"preview": "import torch, os\nfrom einops import rearrange\n\n\ntry:\n import flash_attn_interface\n FLASH_ATTN_3_AVAILABLE = True\ne"
},
{
"path": "diffsynth/core/data/__init__.py",
"chars": 44,
"preview": "from .unified_dataset import UnifiedDataset\n"
},
{
"path": "diffsynth/core/data/operators.py",
"chars": 10383,
"preview": "import math\nimport torch, torchvision, imageio, os\nimport imageio.v3 as iio\nfrom PIL import Image\nimport torchaudio\n\n\ncl"
},
{
"path": "diffsynth/core/data/unified_dataset.py",
"chars": 5091,
"preview": "from .operators import *\nimport torch, json, pandas\n\n\nclass UnifiedDataset(torch.utils.data.Dataset):\n def __init__(\n"
},
{
"path": "diffsynth/core/device/__init__.py",
"chars": 188,
"preview": "from .npu_compatible_device import parse_device_type, parse_nccl_backend, get_available_device_type, get_device_name\nfro"
},
{
"path": "diffsynth/core/device/npu_compatible_device.py",
"chars": 2877,
"preview": "import importlib\nimport torch\nfrom typing import Any\n\n\ndef is_torch_npu_available():\n return importlib.util.find_spec"
},
{
"path": "diffsynth/core/gradient/__init__.py",
"chars": 61,
"preview": "from .gradient_checkpoint import gradient_checkpoint_forward\n"
},
{
"path": "diffsynth/core/gradient/gradient_checkpoint.py",
"chars": 1854,
"preview": "import torch\n\n\ntry:\n import deepspeed\n _HAS_DEEPSPEED = True\nexcept ModuleNotFoundError:\n _HAS_DEEPSPEED = Fals"
},
{
"path": "diffsynth/core/loader/__init__.py",
"chars": 165,
"preview": "from .file import load_state_dict, hash_state_dict_keys, hash_model_file\nfrom .model import load_model, load_model_with_"
},
{
"path": "diffsynth/core/loader/config.py",
"chars": 5013,
"preview": "import torch, glob, os\nfrom typing import Optional, Union, Dict\nfrom dataclasses import dataclass\nfrom modelscope import"
},
{
"path": "diffsynth/core/loader/file.py",
"chars": 4804,
"preview": "from safetensors import safe_open\nimport torch, hashlib\n\n\ndef load_state_dict(file_path, torch_dtype=None, device=\"cpu\","
},
{
"path": "diffsynth/core/loader/model.py",
"chars": 5295,
"preview": "from ..vram.initialization import skip_model_initialization\nfrom ..vram.disk_map import DiskMap\nfrom ..vram.layers impor"
},
{
"path": "diffsynth/core/npu_patch/npu_fused_operator.py",
"chars": 1245,
"preview": "import torch\nfrom ..device.npu_compatible_device import get_device_type\ntry:\n import torch_npu\nexcept:\n pass\n\n\ndef"
},
{
"path": "diffsynth/core/vram/__init__.py",
"chars": 76,
"preview": "from .initialization import skip_model_initialization\nfrom .layers import *\n"
},
{
"path": "diffsynth/core/vram/disk_map.py",
"chars": 3407,
"preview": "from safetensors import safe_open\nimport torch, os\n\n\nclass SafetensorsCompatibleTensor:\n def __init__(self, tensor):\n"
},
{
"path": "diffsynth/core/vram/initialization.py",
"chars": 766,
"preview": "import torch\nfrom contextlib import contextmanager\n\n\n@contextmanager\ndef skip_model_initialization(device=torch.device(\""
},
{
"path": "diffsynth/core/vram/layers.py",
"chars": 19417,
"preview": "import torch, copy\nfrom typing import Union\nfrom .initialization import skip_model_initialization\nfrom .disk_map import "
},
{
"path": "diffsynth/diffusion/__init__.py",
"chars": 238,
"preview": "from .flow_match import FlowMatchScheduler\nfrom .training_module import DiffusionTrainingModule\nfrom .logger import Mode"
},
{
"path": "diffsynth/diffusion/base_pipeline.py",
"chars": 21809,
"preview": "from PIL import Image\nimport torch\nimport numpy as np\nfrom einops import repeat, reduce\nfrom typing import Union\nfrom .."
},
{
"path": "diffsynth/diffusion/flow_match.py",
"chars": 10309,
"preview": "import torch, math\nfrom typing_extensions import Literal\n\n\nclass FlowMatchScheduler():\n\n def __init__(self, template:"
},
{
"path": "diffsynth/diffusion/logger.py",
"chars": 2158,
"preview": "import os, torch\nfrom accelerate import Accelerator\n\n\nclass ModelLogger:\n def __init__(self, output_path, remove_pref"
},
{
"path": "diffsynth/diffusion/loss.py",
"chars": 8713,
"preview": "from .base_pipeline import BasePipeline\nimport torch\n\n\ndef FlowMatchSFTLoss(pipe: BasePipeline, **inputs):\n max_times"
},
{
"path": "diffsynth/diffusion/parsers.py",
"chars": 5420,
"preview": "import argparse\n\n\ndef add_dataset_base_config(parser: argparse.ArgumentParser):\n parser.add_argument(\"--dataset_base_"
},
{
"path": "diffsynth/diffusion/runner.py",
"chars": 3761,
"preview": "import os, torch\nfrom tqdm import tqdm\nfrom accelerate import Accelerator\nfrom .training_module import DiffusionTraining"
},
{
"path": "diffsynth/diffusion/training_module.py",
"chars": 14276,
"preview": "import torch, json, os, inspect\nfrom ..core import ModelConfig, load_state_dict\nfrom ..utils.controlnet import ControlNe"
},
{
"path": "diffsynth/models/anima_dit.py",
"chars": 58732,
"preview": "# original code from: comfy/ldm/cosmos/predict2.py\n\nimport torch\nfrom torch import nn\nfrom einops import rearrange, repe"
},
{
"path": "diffsynth/models/dinov3_image_encoder.py",
"chars": 3277,
"preview": "from transformers import DINOv3ViTModel, DINOv3ViTImageProcessorFast\nfrom transformers.models.dinov3_vit.modeling_dinov3"
},
{
"path": "diffsynth/models/flux2_dit.py",
"chars": 42155,
"preview": "import inspect\nfrom typing import Any, Dict, List, Optional, Tuple, Union\n\nimport torch, math\nimport torch.nn as nn\nimpo"
},
{
"path": "diffsynth/models/flux2_text_encoder.py",
"chars": 2559,
"preview": "from transformers import Mistral3ForConditionalGeneration, Mistral3Config\n\n\nclass Flux2TextEncoder(Mistral3ForConditiona"
},
{
"path": "diffsynth/models/flux2_vae.py",
"chars": 100582,
"preview": "# Copyright 2025 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "diffsynth/models/flux_controlnet.py",
"chars": 19078,
"preview": "import torch\nfrom einops import rearrange, repeat\nfrom .flux_dit import RoPEEmbedding, TimestepEmbeddings, FluxJointTran"
},
{
"path": "diffsynth/models/flux_dit.py",
"chars": 17397,
"preview": "import torch\nfrom .general_modules import TimestepEmbeddings, AdaLayerNorm, RMSNorm\nfrom einops import rearrange\n\n\ndef i"
},
{
"path": "diffsynth/models/flux_infiniteyou.py",
"chars": 3673,
"preview": "import math\nimport torch\nimport torch.nn as nn\n\n\n# FFN\ndef FeedForward(dim, mult=4):\n inner_dim = int(dim * mult)\n "
},
{
"path": "diffsynth/models/flux_ipadapter.py",
"chars": 4230,
"preview": "from .general_modules import RMSNorm\nfrom transformers import SiglipVisionModel, SiglipVisionConfig\nimport torch\n\n\nclass"
},
{
"path": "diffsynth/models/flux_lora_encoder.py",
"chars": 37187,
"preview": "import torch\nfrom einops import rearrange\n\n\ndef low_version_attention(query, key, value, attn_bias=None):\n scale = 1 "
},
{
"path": "diffsynth/models/flux_lora_patcher.py",
"chars": 20412,
"preview": "import torch, math\nfrom ..core.loader import load_state_dict\nfrom typing import Union\n\nclass GeneralLoRALoader:\n def "
},
{
"path": "diffsynth/models/flux_text_encoder_clip.py",
"chars": 4716,
"preview": "import torch\n\n\nclass Attention(torch.nn.Module):\n\n def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q"
},
{
"path": "diffsynth/models/flux_text_encoder_t5.py",
"chars": 1390,
"preview": "import torch\nfrom transformers import T5EncoderModel, T5Config\n\n\nclass FluxTextEncoderT5(T5EncoderModel):\n def __init"
},
{
"path": "diffsynth/models/flux_vae.py",
"chars": 18636,
"preview": "import torch\nfrom einops import rearrange, repeat\n\n\nclass TileWorker:\n def __init__(self):\n pass\n\n\n def mas"
},
{
"path": "diffsynth/models/flux_value_control.py",
"chars": 2007,
"preview": "import torch\nfrom .general_modules import TemporalTimesteps\n\n\nclass MultiValueEncoder(torch.nn.Module):\n def __init__"
},
{
"path": "diffsynth/models/general_modules.py",
"chars": 5814,
"preview": "import torch, math\n\n\ndef get_timestep_embedding(\n timesteps: torch.Tensor,\n embedding_dim: int,\n flip_sin_to_co"
},
{
"path": "diffsynth/models/longcat_video_dit.py",
"chars": 34046,
"preview": "from typing import List, Optional, Tuple\n\nimport math\nimport torch\nimport torch.nn as nn\nimport torch.amp as amp\n\nimport"
},
{
"path": "diffsynth/models/ltx2_audio_vae.py",
"chars": 72006,
"preview": "from typing import Set, Tuple, Optional, List\nfrom enum import Enum\nimport math\nimport einops\nimport torch\nimport torch."
},
{
"path": "diffsynth/models/ltx2_common.py",
"chars": 13672,
"preview": "from dataclasses import dataclass\nfrom typing import NamedTuple, Protocol, Tuple\nimport torch\nfrom torch import nn\nfrom "
},
{
"path": "diffsynth/models/ltx2_dit.py",
"chars": 67093,
"preview": "import math\nimport functools\nfrom dataclasses import dataclass, replace\nfrom enum import Enum\nfrom typing import Optiona"
},
{
"path": "diffsynth/models/ltx2_text_encoder.py",
"chars": 24498,
"preview": "import math\nimport torch\nimport torch.nn as nn\nfrom einops import rearrange\nfrom transformers import Gemma3ForConditiona"
},
{
"path": "diffsynth/models/ltx2_upsampler.py",
"chars": 12732,
"preview": "import math\nfrom typing import Optional, Tuple\nimport torch\nfrom einops import rearrange\nimport torch.nn.functional as F"
},
{
"path": "diffsynth/models/ltx2_video_vae.py",
"chars": 92925,
"preview": "import itertools\nimport math\nimport einops\nfrom dataclasses import replace, dataclass\nfrom typing import Any, Callable, "
},
{
"path": "diffsynth/models/model_loader.py",
"chars": 5585,
"preview": "from ..core.loader import load_model, hash_model_file\nfrom ..core.vram import AutoWrappedModule\nfrom ..configs import MO"
},
{
"path": "diffsynth/models/mova_audio_dit.py",
"chars": 2247,
"preview": "import torch\nimport torch.nn as nn\nfrom .wan_video_dit import WanModel, precompute_freqs_cis, sinusoidal_embedding_1d\nfr"
},
{
"path": "diffsynth/models/mova_audio_vae.py",
"chars": 26382,
"preview": "import math\nfrom typing import List, Union\nimport numpy as np\nimport torch\nfrom torch import nn\nfrom torch.nn.utils impo"
},
{
"path": "diffsynth/models/mova_dual_tower_bridge.py",
"chars": 26286,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom typing import Dict, List, Tuple, Optional\nfrom e"
},
{
"path": "diffsynth/models/nexus_gen.py",
"chars": 6986,
"preview": "import torch\nfrom PIL import Image\n\n\nclass NexusGenAutoregressiveModel(torch.nn.Module):\n def __init__(self, max_leng"
},
{
"path": "diffsynth/models/nexus_gen_ar_model.py",
"chars": 59159,
"preview": "import os\nimport re\nfrom dataclasses import dataclass\nfrom typing import Any, Dict, List, Optional, Tuple, Union\n\nimport"
},
{
"path": "diffsynth/models/nexus_gen_projector.py",
"chars": 17793,
"preview": "import math\nimport torch\nimport torch.nn as nn\nfrom typing import Optional, Tuple\n\n\n\ndef rotate_half(x):\n \"\"\"Rotates "
},
{
"path": "diffsynth/models/qwen_image_controlnet.py",
"chars": 1712,
"preview": "import torch\nimport torch.nn as nn\nfrom .general_modules import RMSNorm\n\n\nclass BlockWiseControlBlock(torch.nn.Module):\n"
},
{
"path": "diffsynth/models/qwen_image_dit.py",
"chars": 30355,
"preview": "import torch, math, functools\nimport torch.nn as nn\nfrom typing import Tuple, Optional, Union, List\nfrom einops import r"
},
{
"path": "diffsynth/models/qwen_image_image2lora.py",
"chars": 5192,
"preview": "import torch\n\n\nclass CompressedMLP(torch.nn.Module):\n def __init__(self, in_dim, mid_dim, out_dim, bias=False):\n "
},
{
"path": "diffsynth/models/qwen_image_text_encoder.py",
"chars": 6922,
"preview": "import torch\nfrom typing import Optional, Union\n\n\nclass QwenImageTextEncoder(torch.nn.Module):\n def __init__(self):\n "
},
{
"path": "diffsynth/models/qwen_image_vae.py",
"chars": 25151,
"preview": "import torch\nfrom typing import List, Optional, Tuple, Union\nfrom torch import nn\n\n\nCACHE_T = 2\n\nclass QwenImageCausalCo"
},
{
"path": "diffsynth/models/sd_text_encoder.py",
"chars": 32698,
"preview": "import torch\nfrom .attention import Attention\nfrom einops import rearrange\n\n\ndef low_version_attention(query, key, value"
},
{
"path": "diffsynth/models/siglip2_image_encoder.py",
"chars": 4818,
"preview": "from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer, SiglipVisionConfig\nfrom transformers imp"
},
{
"path": "diffsynth/models/step1x_connector.py",
"chars": 20916,
"preview": "from typing import Optional\n\nimport torch, math\nimport torch.nn\nfrom einops import rearrange\nfrom torch import nn\nfrom f"
},
{
"path": "diffsynth/models/step1x_text_encoder.py",
"chars": 7957,
"preview": "import torch\nfrom typing import Optional, Union\nfrom .qwen_image_text_encoder import QwenImageTextEncoder\nfrom ..core.de"
},
{
"path": "diffsynth/models/wan_video_animate_adapter.py",
"chars": 18735,
"preview": "import torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nimport math\nfrom typing import Tuple, Optional, "
},
{
"path": "diffsynth/models/wan_video_camera_controller.py",
"chars": 7576,
"preview": "import torch\nimport torch.nn as nn\nimport numpy as np\nfrom einops import rearrange\nimport os\nfrom typing_extensions impo"
},
{
"path": "diffsynth/models/wan_video_dit.py",
"chars": 22421,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport math\nfrom typing import Tuple, Optional\nfrom e"
},
{
"path": "diffsynth/models/wan_video_dit_s2v.py",
"chars": 25071,
"preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom typing import Tuple\nfrom .wan"
},
{
"path": "diffsynth/models/wan_video_image_encoder.py",
"chars": 27486,
"preview": "\"\"\"\nConcise re-implementation of\n``https://github.com/openai/CLIP'' and\n``https://github.com/mlfoundations/open_clip''.\n"
},
{
"path": "diffsynth/models/wan_video_mot.py",
"chars": 7169,
"preview": "import torch\nfrom .wan_video_dit import DiTBlock, SelfAttention, rope_apply, flash_attention, modulate, MLP\nimport einop"
},
{
"path": "diffsynth/models/wan_video_motion_controller.py",
"chars": 802,
"preview": "import torch\nimport torch.nn as nn\nfrom .wan_video_dit import sinusoidal_embedding_1d\n\n\n\nclass WanMotionControllerModel("
},
{
"path": "diffsynth/models/wan_video_text_encoder.py",
"chars": 11121,
"preview": "import math\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom transformers import AutoTokenizer\nim"
},
{
"path": "diffsynth/models/wan_video_vace.py",
"chars": 2577,
"preview": "import torch\nfrom .wan_video_dit import DiTBlock\nfrom ..core.gradient import gradient_checkpoint_forward\n\nclass VaceWanA"
},
{
"path": "diffsynth/models/wan_video_vae.py",
"chars": 50487,
"preview": "from einops import rearrange, repeat\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom tqdm impor"
},
{
"path": "diffsynth/models/wantodance.py",
"chars": 6588,
"preview": "from inspect import isfunction\nfrom math import log, pi\n\nimport torch\nfrom einops import rearrange, repeat\nfrom torch im"
},
{
"path": "diffsynth/models/wav2vec.py",
"chars": 8520,
"preview": "import math\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\n\n\ndef get_sample_indices(original_fps, total"
},
{
"path": "diffsynth/models/z_image_controlnet.py",
"chars": 5969,
"preview": "from .z_image_dit import ZImageTransformerBlock\nfrom ..core.gradient import gradient_checkpoint_forward\nfrom torch.nn.ut"
},
{
"path": "diffsynth/models/z_image_dit.py",
"chars": 45260,
"preview": "import math\nfrom typing import List, Optional, Tuple\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F"
},
{
"path": "diffsynth/models/z_image_image2lora.py",
"chars": 7636,
"preview": "import torch\nfrom .qwen_image_image2lora import ImageEmbeddingToLoraMatrix, SequencialMLP\n\n\nclass LoRATrainerBlock(torch"
},
{
"path": "diffsynth/models/z_image_text_encoder.py",
"chars": 3875,
"preview": "from transformers import Qwen3Model, Qwen3Config\nimport torch\n\n\nclass ZImageTextEncoder(torch.nn.Module):\n def __init"
},
{
"path": "diffsynth/pipelines/anima_image.py",
"chars": 9690,
"preview": "import torch, math\nfrom PIL import Image\nfrom typing import Union\nfrom tqdm import tqdm\nfrom einops import rearrange\nimp"
},
{
"path": "diffsynth/pipelines/flux2_image.py",
"chars": 22583,
"preview": "import torch, math, torchvision\nfrom PIL import Image\nfrom typing import Union\nfrom tqdm import tqdm\nfrom einops import "
},
{
"path": "diffsynth/pipelines/flux_image.py",
"chars": 57823,
"preview": "import torch, math\nfrom PIL import Image\nfrom typing import Union\nfrom tqdm import tqdm\nfrom einops import rearrange, re"
},
{
"path": "diffsynth/pipelines/ltx2_audio_video.py",
"chars": 41471,
"preview": "import torch, types\nimport numpy as np\nfrom PIL import Image\nfrom einops import repeat\nfrom typing import Optional, Unio"
},
{
"path": "diffsynth/pipelines/mova_audio_video.py",
"chars": 20436,
"preview": "import sys\nimport torch, types\nfrom PIL import Image\nfrom typing import Optional, Union\nfrom einops import rearrange\nimp"
},
{
"path": "diffsynth/pipelines/qwen_image.py",
"chars": 42659,
"preview": "import torch, math\nfrom PIL import Image\nfrom typing import Union\nfrom tqdm import tqdm\nfrom einops import rearrange\nimp"
},
{
"path": "diffsynth/pipelines/wan_video.py",
"chars": 85913,
"preview": "import torch, types\nimport numpy as np\nfrom PIL import Image\nfrom einops import repeat\nfrom typing import Optional, Unio"
},
{
"path": "diffsynth/pipelines/z_image.py",
"chars": 27409,
"preview": "import torch, math, warnings\nfrom PIL import Image\nfrom typing import Union\nfrom tqdm import tqdm\nfrom einops import rea"
},
{
"path": "diffsynth/utils/controlnet/__init__.py",
"chars": 79,
"preview": "from .controlnet_input import ControlNetInput\nfrom .annotator import Annotator\n"
},
{
"path": "diffsynth/utils/controlnet/annotator.py",
"chars": 3035,
"preview": "from typing_extensions import Literal, TypeAlias\n\nfrom diffsynth.core.device.npu_compatible_device import get_device_typ"
},
{
"path": "diffsynth/utils/controlnet/controlnet_input.py",
"chars": 320,
"preview": "from dataclasses import dataclass\nfrom PIL import Image\n\n\n@dataclass\nclass ControlNetInput:\n controlnet_id: int = 0\n "
},
{
"path": "diffsynth/utils/data/__init__.py",
"chars": 6913,
"preview": "import imageio, os\nimport numpy as np\nfrom PIL import Image\nfrom tqdm import tqdm\nimport subprocess\nimport shutil\n\n\nclas"
},
{
"path": "diffsynth/utils/data/audio.py",
"chars": 4358,
"preview": "import torch\nimport torchaudio\n\n\ndef convert_to_mono(audio_tensor: torch.Tensor) -> torch.Tensor:\n \"\"\"\n Convert au"
},
{
"path": "diffsynth/utils/data/audio_video.py",
"chars": 5141,
"preview": "import av\nfrom fractions import Fraction\nimport torch\nfrom PIL import Image\nfrom tqdm import tqdm\nfrom .audio import con"
},
{
"path": "diffsynth/utils/data/media_io_ltx2.py",
"chars": 1557,
"preview": "import av\nimport numpy as np\nfrom io import BytesIO\nfrom .audio_video import write_video_audio as write_video_audio_ltx2"
},
{
"path": "diffsynth/utils/lora/__init__.py",
"chars": 108,
"preview": "from .general import GeneralLoRALoader\nfrom .merge import merge_lora\nfrom .reset_rank import reset_lora_rank"
},
{
"path": "diffsynth/utils/lora/flux.py",
"chars": 21357,
"preview": "from .general import GeneralLoRALoader\nimport torch, math\n\n\nclass FluxLoRALoader(GeneralLoRALoader):\n def __init__(se"
},
{
"path": "diffsynth/utils/lora/general.py",
"chars": 3516,
"preview": "import torch, warnings\n\n\nclass GeneralLoRALoader:\n def __init__(self, device=\"cpu\", torch_dtype=torch.float32):\n "
},
{
"path": "diffsynth/utils/lora/merge.py",
"chars": 698,
"preview": "import torch\nfrom typing import Dict, List\n\n\ndef merge_lora_weight(tensors_A, tensors_B):\n lora_A = torch.concat(tens"
},
{
"path": "diffsynth/utils/lora/reset_rank.py",
"chars": 786,
"preview": "import torch\n\ndef decomposite(tensor_A, tensor_B, rank):\n dtype, device = tensor_A.dtype, tensor_A.device\n weight "
},
{
"path": "diffsynth/utils/ses/README.md",
"chars": 140,
"preview": "Please see `docs/en/Research_Tutorial/inference_time_scaling.md` or `docs/zh/Research_Tutorial/inference_time_scaling.md"
},
{
"path": "diffsynth/utils/ses/__init__.py",
"chars": 27,
"preview": "from .ses import ses_search"
},
{
"path": "diffsynth/utils/ses/ses.py",
"chars": 4177,
"preview": "import torch\nimport pywt\nimport numpy as np\nfrom tqdm import tqdm\n\n\ndef split_dwt(z_tensor_cpu, wavelet_name, dwt_level)"
},
{
"path": "diffsynth/utils/state_dict_converters/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "diffsynth/utils/state_dict_converters/anima_dit.py",
"chars": 209,
"preview": "def AnimaDiTStateDictConverter(state_dict):\n new_state_dict = {}\n for key in state_dict:\n value = state_dic"
},
{
"path": "diffsynth/utils/state_dict_converters/flux2_text_encoder.py",
"chars": 870,
"preview": "def Flux2TextEncoderStateDictConverter(state_dict):\n rename_dict = {\n \"multi_modal_projector.linear_1.weight\":"
},
{
"path": "diffsynth/utils/state_dict_converters/flux_controlnet.py",
"chars": 4859,
"preview": "import torch\n\n\ndef FluxControlNetStateDictConverter(state_dict):\n global_rename_dict = {\n \"context_embedder\": "
},
{
"path": "diffsynth/utils/state_dict_converters/flux_dit.py",
"chars": 9533,
"preview": "import torch\n\n\ndef FluxDiTStateDictConverter(state_dict):\n is_nexus_gen = sum([key.startswith(\"pipe.dit.\") for key in"
},
{
"path": "diffsynth/utils/state_dict_converters/flux_infiniteyou.py",
"chars": 100,
"preview": "def FluxInfiniteYouImageProjectorStateDictConverter(state_dict):\n return state_dict['image_proj']"
},
{
"path": "diffsynth/utils/state_dict_converters/flux_ipadapter.py",
"chars": 1093,
"preview": "def FluxIpAdapterStateDictConverter(state_dict):\n state_dict_ = {}\n \n if \"ip_adapter\" in state_dict and isinsta"
},
{
"path": "diffsynth/utils/state_dict_converters/flux_text_encoder_clip.py",
"chars": 1396,
"preview": "def FluxTextEncoderClipStateDictConverter(state_dict):\n rename_dict = {\n \"text_model.embeddings.token_embeddin"
},
{
"path": "diffsynth/utils/state_dict_converters/flux_text_encoder_t5.py",
"chars": 210,
"preview": "def FluxTextEncoderT5StateDictConverter(state_dict):\n state_dict_ = {i: state_dict[i] for i in state_dict}\n state_"
},
{
"path": "diffsynth/utils/state_dict_converters/flux_vae.py",
"chars": 23589,
"preview": "def FluxVAEEncoderStateDictConverter(state_dict):\n rename_dict = {\n \"encoder.conv_in.bias\": \"conv_in.bias\",\n "
},
{
"path": "diffsynth/utils/state_dict_converters/ltx2_audio_vae.py",
"chars": 1295,
"preview": "def LTX2AudioEncoderStateDictConverter(state_dict):\n # Not used\n state_dict_ = {}\n for name in state_dict:\n "
},
{
"path": "diffsynth/utils/state_dict_converters/ltx2_dit.py",
"chars": 435,
"preview": "def LTXModelStateDictConverter(state_dict):\n state_dict_ = {}\n for name in state_dict:\n if name.startswith("
},
{
"path": "diffsynth/utils/state_dict_converters/ltx2_text_encoder.py",
"chars": 1593,
"preview": "def LTX2TextEncoderStateDictConverter(state_dict):\n state_dict_ = {}\n for key in state_dict:\n if key.starts"
},
{
"path": "diffsynth/utils/state_dict_converters/ltx2_video_vae.py",
"chars": 1306,
"preview": "def LTX2VideoEncoderStateDictConverter(state_dict):\n state_dict_ = {}\n for name in state_dict:\n if name.sta"
},
{
"path": "diffsynth/utils/state_dict_converters/nexus_gen.py",
"chars": 218,
"preview": "def NexusGenAutoregressiveModelStateDictConverter(state_dict):\n new_state_dict = {}\n for key in state_dict:\n "
},
{
"path": "diffsynth/utils/state_dict_converters/nexus_gen_projector.py",
"chars": 552,
"preview": "def NexusGenMergerStateDictConverter(state_dict):\n merger_state_dict = {}\n for key in state_dict:\n if key.s"
},
{
"path": "diffsynth/utils/state_dict_converters/qwen_image_text_encoder.py",
"chars": 341,
"preview": "def QwenImageTextEncoderStateDictConverter(state_dict):\n state_dict_ = {}\n for k in state_dict:\n v = state_"
},
{
"path": "diffsynth/utils/state_dict_converters/step1x_connector.py",
"chars": 258,
"preview": "def Qwen2ConnectorStateDictConverter(state_dict):\n state_dict_ = {}\n for name in state_dict:\n if name.start"
},
{
"path": "diffsynth/utils/state_dict_converters/wan_video_animate_adapter.py",
"chars": 333,
"preview": "def WanAnimateAdapterStateDictConverter(state_dict):\n state_dict_ = {}\n for name in state_dict:\n if name.st"
},
{
"path": "diffsynth/utils/state_dict_converters/wan_video_dit.py",
"chars": 4980,
"preview": "def WanVideoDiTFromDiffusers(state_dict):\n rename_dict = {\n \"blocks.0.attn1.norm_k.weight\": \"blocks.0.self_att"
},
{
"path": "diffsynth/utils/state_dict_converters/wan_video_image_encoder.py",
"chars": 261,
"preview": "def WanImageEncoderStateDictConverter(state_dict):\n state_dict_ = {}\n for name in state_dict:\n if name.star"
},
{
"path": "diffsynth/utils/state_dict_converters/wan_video_mot.py",
"chars": 4942,
"preview": "def WanVideoMotStateDictConverter(state_dict):\n rename_dict = {\n \"blocks.0.attn1.norm_k.weight\": \"blocks.0.sel"
},
{
"path": "diffsynth/utils/state_dict_converters/wan_video_vace.py",
"chars": 159,
"preview": "def VaceWanModelDictConverter(state_dict):\n state_dict_ = {name: state_dict[name] for name in state_dict if name.star"
},
{
"path": "diffsynth/utils/state_dict_converters/wan_video_vae.py",
"chars": 257,
"preview": "def WanVideoVAEStateDictConverter(state_dict):\n state_dict_ = {}\n if 'model_state' in state_dict:\n state_di"
},
{
"path": "diffsynth/utils/state_dict_converters/wans2v_audio_encoder.py",
"chars": 588,
"preview": "def WanS2VAudioEncoderStateDictConverter(state_dict):\n rename_dict = {\n \"model.wav2vec2.encoder.pos_conv_embed"
},
{
"path": "diffsynth/utils/state_dict_converters/z_image_text_encoder.py",
"chars": 210,
"preview": "def ZImageTextEncoderStateDictConverter(state_dict):\n state_dict_ = {}\n for name in state_dict:\n if name !="
},
{
"path": "diffsynth/utils/xfuser/__init__.py",
"chars": 157,
"preview": "from .xdit_context_parallel import usp_attn_forward, usp_dit_forward, get_sequence_parallel_world_size, initialize_usp, "
},
{
"path": "diffsynth/utils/xfuser/xdit_context_parallel.py",
"chars": 6210,
"preview": "import torch\nfrom typing import Optional\nfrom einops import rearrange\nfrom yunchang.kernels import AttnType\nfrom xfuser."
},
{
"path": "diffsynth/version.py",
"chars": 271,
"preview": "# Make sure to modify __release_datetime__ to release time when making official release.\n__version__ = '2.0.0'\n# default"
},
{
"path": "docs/en/.readthedocs.yaml",
"chars": 717,
"preview": "# .readthedocs.yaml\n# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html f"
},
{
"path": "docs/en/API_Reference/core/attention.md",
"chars": 5408,
"preview": "# `diffsynth.core.attention`: Attention Mechanism Implementation\n\n`diffsynth.core.attention` provides routing mechanisms"
},
{
"path": "docs/en/API_Reference/core/data.md",
"chars": 7157,
"preview": "# `diffsynth.core.data`: Data Processing Operators and Universal Dataset\n\n## Data Processing Operators\n\n### Available Da"
},
{
"path": "docs/en/API_Reference/core/gradient.md",
"chars": 4232,
"preview": "# `diffsynth.core.gradient`: Gradient Checkpointing and Offload\n\n`diffsynth.core.gradient` provides encapsulated gradien"
},
{
"path": "docs/en/API_Reference/core/loader.md",
"chars": 6653,
"preview": "# `diffsynth.core.loader`: Model Download and Loading\n\nThis document introduces the model download and loading functiona"
},
{
"path": "docs/en/API_Reference/core/vram.md",
"chars": 3553,
"preview": "# `diffsynth.core.vram`: VRAM Management\n\nThis document introduces the underlying VRAM management functionalities in `di"
},
{
"path": "docs/en/Developer_Guide/Building_a_Pipeline.md",
"chars": 12419,
"preview": "# Building a Pipeline\n\nAfter [integrating the required models for the Pipeline](../Developer_Guide/Integrating_Your_Mode"
},
{
"path": "docs/en/Developer_Guide/Enabling_VRAM_management.md",
"chars": 19108,
"preview": "# Fine-Grained VRAM Management Scheme\n\nThis document introduces how to write reasonable fine-grained VRAM management sch"
},
{
"path": "docs/en/Developer_Guide/Integrating_Your_Model.md",
"chars": 11109,
"preview": "# Integrating Model Architecture\n\nThis document introduces how to integrate models into the `DiffSynth-Studio` framework"
},
{
"path": "docs/en/Developer_Guide/Training_Diffusion_Models.md",
"chars": 2849,
"preview": "# Integrating Model Training\n\nAfter [integrating models](../Developer_Guide/Integrating_Your_Model.md) and [implementing"
},
{
"path": "docs/en/Makefile",
"chars": 633,
"preview": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the "
},
{
"path": "docs/en/Model_Details/Anima.md",
"chars": 7814,
"preview": "# Anima\n\nAnima is an image generation model trained and open-sourced by CircleStone Labs and Comfy Org.\n\n## Installation"
},
{
"path": "docs/en/Model_Details/FLUX.md",
"chars": 23889,
"preview": "# FLUX\n\n\n\nFLUX is an image gene"
},
{
"path": "docs/en/Model_Details/FLUX2.md",
"chars": 13765,
"preview": "# FLUX.2\n\nFLUX.2 is an image generation model trained and open-sourced by Black Forest Labs.\n\n## Model Lineage\n\n```merma"
},
{
"path": "docs/en/Model_Details/LTX-2.md",
"chars": 26246,
"preview": "# LTX-2\n\nLTX-2 is a series of audio-video generation models developed by Lightricks.\n\n## Installation\n\nBefore using this"
},
{
"path": "docs/en/Model_Details/Overview.md",
"chars": 55595,
"preview": "# Model Directory\n\n## Qwen-Image\n\nDocumentation: [./Qwen-Image.md](../Model_Details/Qwen-Image.md)\n\n<details>\n\n<summary>"
},
{
"path": "docs/en/Model_Details/Qwen-Image.md",
"chars": 29964,
"preview": "# Qwen-Image\n\n\n\nQwen-Image is a"
},
{
"path": "docs/en/Model_Details/Wan.md",
"chars": 49009,
"preview": "# Wan\n\nhttps://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314\n\nWan is a video generation model "
},
{
"path": "docs/en/Model_Details/Z-Image.md",
"chars": 16424,
"preview": "# Z-Image\n\nZ-Image is an image generation model trained and open-sourced by the Multimodal Interaction Team of Alibaba T"
},
{
"path": "docs/en/Pipeline_Usage/Environment_Variables.md",
"chars": 1731,
"preview": "# Environment Variables\n\n`DiffSynth-Studio` can control some settings through environment variables.\n\nIn `Python` code, "
},
{
"path": "docs/en/Pipeline_Usage/GPU_support.md",
"chars": 4822,
"preview": "# GPU/NPU Support\n\n`DiffSynth-Studio` supports various GPUs and NPUs. This document explains how to run model inference "
},
{
"path": "docs/en/Pipeline_Usage/Model_Inference.md",
"chars": 7522,
"preview": "# Model Inference\n\nThis document uses the Qwen-Image model as an example to introduce how to use `DiffSynth-Studio` for "
},
{
"path": "docs/en/Pipeline_Usage/Model_Training.md",
"chars": 18558,
"preview": "# Model Training\n\nThis document introduces how to use `DiffSynth-Studio` for model training.\n\n## Script Parameters\n\nTrai"
},
{
"path": "docs/en/Pipeline_Usage/Setup.md",
"chars": 1688,
"preview": "# Installing Dependencies\n\nInstall from source (recommended):\n\n```\ngit clone https://github.com/modelscope/DiffSynth-Stu"
},
{
"path": "docs/en/Pipeline_Usage/VRAM_management.md",
"chars": 11375,
"preview": "# VRAM Management\n\nVRAM management is a distinctive feature of `DiffSynth-Studio` that enables GPUs with low VRAM to run"
},
{
"path": "docs/en/QA.md",
"chars": 3545,
"preview": "# Frequently Asked Questions\n\n## Why doesn't the training framework support batch size > 1?\n\n* **Larger batch sizes no l"
},
{
"path": "docs/en/README.md",
"chars": 5199,
"preview": "# DiffSynth-Studio Documentation\n\nWelcome to the magical world of Diffusion models! `DiffSynth-Studio` is an open-source"
},
{
"path": "docs/en/Research_Tutorial/inference_time_scaling.ipynb",
"chars": 10626,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"8db54992\",\n \"metadata\": {},\n \"source\": [\n \"# Inference Op"
},
{
"path": "docs/en/Research_Tutorial/inference_time_scaling.md",
"chars": 8195,
"preview": "# Inference Optimization Techniques\n\nDiffSynth-Studio aims to drive technological innovation through its foundational fr"
},
{
"path": "docs/en/Research_Tutorial/train_from_scratch.md",
"chars": 20828,
"preview": "# Training Models from Scratch\n\nDiffSynth-Studio's training engine supports training foundation models from scratch. Thi"
},
{
"path": "docs/en/Research_Tutorial/train_from_scratch.py",
"chars": 13659,
"preview": "import torch, accelerate\nfrom PIL import Image\nfrom typing import Union\nfrom tqdm import tqdm\nfrom einops import rearran"
},
{
"path": "docs/en/Training/Differential_LoRA.md",
"chars": 3032,
"preview": "# Differential LoRA Training\n\nDifferential LoRA training is a special form of LoRA training designed to enable models to"
},
{
"path": "docs/en/Training/Direct_Distill.md",
"chars": 5724,
"preview": "# End-to-End Distillation Accelerated Training\n\n## Distillation Accelerated Training\n\nThe inference process of Diffusion"
},
{
"path": "docs/en/Training/FP8_Precision.md",
"chars": 2461,
"preview": "# Enabling FP8 Precision in Training\n\nAlthough `DiffSynth-Studio` supports [VRAM management](../Pipeline_Usage/VRAM_mana"
},
{
"path": "docs/en/Training/Split_Training.md",
"chars": 5408,
"preview": "# Two-Stage Split Training\n\nThis document introduces split training, which can automatically divide the training process"
},
{
"path": "docs/en/Training/Supervised_Fine_Tuning.md",
"chars": 6605,
"preview": "# Standard Supervised Training\n\nAfter understanding the [Basic Principles of Diffusion Models](../Training/Understanding"
},
{
"path": "docs/en/Training/Understanding_Diffusion_models.md",
"chars": 9611,
"preview": "# Basic Principles of Diffusion Models\n\nThis document introduces the basic principles of Diffusion models to help you un"
},
{
"path": "docs/en/conf.py",
"chars": 4172,
"preview": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common op"
},
{
"path": "docs/en/index.rst",
"chars": 1621,
"preview": "Welcome to DiffSynth-Studio's Documentation\n==========================================\n\n.. toctree::\n :maxdepth: 2\n "
},
{
"path": "docs/requirements.txt",
"chars": 189,
"preview": "docutils>=0.16.0\nmyst_parser\nrecommonmark\nsphinx>=5.3.0\nsphinx-book-theme\nsphinx-copybutton\nsphinx-autobuild\nsphinx-rtd-"
},
{
"path": "docs/zh/.readthedocs.yaml",
"chars": 717,
"preview": "# .readthedocs.yaml\n# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html f"
},
{
"path": "docs/zh/API_Reference/core/attention.md",
"chars": 3410,
"preview": "# `diffsynth.core.attention`: 注意力机制实现\n\n`diffsynth.core.attention` 提供了注意力机制实现的路由机制,根据 `Python` 环境中的可用包和[环境变量](../../Pipel"
},
{
"path": "docs/zh/API_Reference/core/data.md",
"chars": 4403,
"preview": "# `diffsynth.core.data`: 数据处理算子与通用数据集\n\n## 数据处理算子\n\n### 可用数据处理算子\n\n`diffsynth.core.data` 提供了一系列数据处理算子,用于进行数据处理,包括:\n\n* 数据格式转"
},
{
"path": "docs/zh/API_Reference/core/gradient.md",
"chars": 2311,
"preview": "# `diffsynth.core.gradient`: 梯度检查点及其 Offload\n\n`diffsynth.core.gradient` 中提供了封装好的梯度检查点及其 Offload 版本,用于模型训练。\n\n## 梯度检查点\n\n梯度"
},
{
"path": "docs/zh/API_Reference/core/loader.md",
"chars": 4842,
"preview": "# `diffsynth.core.loader`: 模型下载与加载\n\n本文档介绍 `diffsynth.core.loader` 中模型下载与加载相关的功能。\n\n## ModelConfig\n\n`diffsynth.core.loader"
},
{
"path": "docs/zh/API_Reference/core/vram.md",
"chars": 2338,
"preview": "# `diffsynth.core.vram`: 显存管理\n\n本文档介绍 `diffsynth.core.vram` 中的显存管理底层功能,如果你希望将这些功能用于其他的代码库中,可参考本文档。\n\n## 跳过模型参数初始化\n\n在 `PyTo"
},
{
"path": "docs/zh/Developer_Guide/Building_a_Pipeline.md",
"chars": 8938,
"preview": "# 接入 Pipeline\n\n在[将 Pipeline 所需的模型接入](../Developer_Guide/Integrating_Your_Model.md)之后,还需构建 `Pipeline` 用于模型推理,本文档提供 `Pipel"
},
{
"path": "docs/zh/Developer_Guide/Enabling_VRAM_management.md",
"chars": 8170,
"preview": "# 细粒度显存管理方案\n\n本文档介绍如何为模型编写合理的细粒度显存管理方案,以及如何将 `DiffSynth-Studio` 中的显存管理功能用于外部的其他代码库,在阅读本文档前,请先阅读文档[显存管理](../Pipeline_Usage"
},
{
"path": "docs/zh/Developer_Guide/Integrating_Your_Model.md",
"chars": 7564,
"preview": "# 接入模型结构\n\n本文档介绍如何将模型接入到 `DiffSynth-Studio` 框架中,供 `Pipeline` 等模块调用。\n\n## Step 1: 集成模型结构代码\n\n`DiffSynth-Studio` 中的所有模型结构实现统一"
},
{
"path": "docs/zh/Developer_Guide/Training_Diffusion_Models.md",
"chars": 2098,
"preview": "# 接入模型训练\n\n在[接入模型](../Developer_Guide/Integrating_Your_Model.md)并[实现 Pipeline](../Developer_Guide/Building_a_Pipeline.md)"
},
{
"path": "docs/zh/Makefile",
"chars": 633,
"preview": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the "
},
{
"path": "docs/zh/Model_Details/Anima.md",
"chars": 6774,
"preview": "# Anima\n\nAnima 是由 CircleStone Labs 与 Comfy Org 训练并开源的图像生成模型。\n\n## 安装\n\n在使用本项目进行模型推理和训练前,请先安装 DiffSynth-Studio。\n\n```shell\ng"
},
{
"path": "docs/zh/Model_Details/FLUX.md",
"chars": 19687,
"preview": "# FLUX\n\n\n\nFLUX 是由 Black Forest "
},
{
"path": "docs/zh/Model_Details/FLUX2.md",
"chars": 10240,
"preview": "# FLUX.2\n\nFLUX.2 是由 Black Forest Labs 训练并开源的图像生成模型。\n\n## 模型血缘\n\n```mermaid\ngraph LR;\n FLUX.2-Series-->black-forest-labs"
},
{
"path": "docs/zh/Model_Details/LTX-2.md",
"chars": 22495,
"preview": "# LTX-2\n\nLTX-2 是由 Lightricks 开发的音视频生成模型系列。\n\n## 安装\n\n在使用本项目进行模型推理和训练前,请先安装 DiffSynth-Studio。\n\n```shell\ngit clone https://g"
},
{
"path": "docs/zh/Model_Details/Overview.md",
"chars": 53781,
"preview": "# 模型目录\n\n## Qwen-Image\n\n文档:[./Qwen-Image.md](../Model_Details/Qwen-Image.md)\n\n<details>\n\n<summary>效果一览</summary>\n\n\n\nQwen-Image 是由阿里"
}
]
// ... and 641 more files (download for full content)
About this extraction
This page contains the full source code of the modelscope/DiffSynth-Studio GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 841 files (3.9 MB), approximately 1.1M tokens, and a symbol index with 2502 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.